bitkeeper revision 1.1385.1.7 (427f6405sUeICnIzUJ_HaXbYnLds4A)
author mafetter@fleming.research <mafetter@fleming.research>
Mon, 9 May 2005 13:22:13 +0000 (13:22 +0000)
committer mafetter@fleming.research <mafetter@fleming.research>
Mon, 9 May 2005 13:22:13 +0000 (13:22 +0000)
Enabling light-weight shadows (especially shadow_mode_dirty).

Light-weight shadows leave all the page ref counts based on the guest p.t. pages,
while heavy-weight shadows do all their ref counts based on the shadow's p.t. pages.
shadow_mode_refcounts(dom) == 1 implies heavy-weight shadows.

13 files changed:
xen/arch/x86/audit.c
xen/arch/x86/domain.c
xen/arch/x86/domain_build.c
xen/arch/x86/mm.c
xen/arch/x86/shadow.c
xen/arch/x86/traps.c
xen/arch/x86/vmx.c
xen/include/asm-x86/mm.h
xen/include/asm-x86/page.h
xen/include/asm-x86/shadow.h
xen/include/asm-x86/x86_32/domain_page.h
xen/include/xen/lib.h
xen/include/xen/perfc_defn.h

index 2df4d69be8097681e5da109a42129fb87b8d8ed0..b7e874c62d6e9510315a3bee4a1bb3a08864a492 100644 (file)
@@ -49,7 +49,8 @@ static int l1, l2, oos_count, page_count;
 int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
 {
     int errors = 0;
-    int shadow_enabled = shadow_mode_enabled(d) ? 1 : 0;
+    int shadow_refcounts = !!shadow_mode_refcounts(d);
+    int shadow_enabled = !!shadow_mode_enabled(d);
     int l2limit;
 
     void _adjust(struct pfn_info *page, int adjtype ADJUST_EXTRA_ARGS)
@@ -119,7 +120,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
             page->count_info += dir;
     }
 
-    void adjust_l2_page(unsigned long mfn)
+    void adjust_l2_page(unsigned long mfn, int shadow)
     {
         unsigned long *pt = map_domain_mem(mfn << PAGE_SHIFT);
         int i;
@@ -133,7 +134,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
 
                 if ( noisy )
                 {
-                    if ( shadow_enabled )
+                    if ( shadow )
                     {
                         if ( page_get_owner(l1page) != NULL )
                         {
@@ -145,6 +146,17 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
                             errors++;
                             continue;
                         }
+
+                        u32 page_type = l1page->u.inuse.type_info & PGT_type_mask;
+
+                        if ( page_type != PGT_l1_shadow )
+                        {
+                            printk("Audit %d: [Shadow L2 mfn=%lx i=%x] "
+                                   "Expected Shadow L1 t=%x mfn=%lx\n",
+                                   d->id, mfn, i,
+                                   l1page->u.inuse.type_info, l1mfn);
+                            errors++;
+                        }
                     }
                     else
                     {
@@ -154,7 +166,9 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
                                    "belonging to other dom %p (id=%d)\n",
                                    l1mfn,
                                    page_get_owner(l1page),
-                                   page_get_owner(l1page)->id);
+                                   (page_get_owner(l1page)
+                                    ? page_get_owner(l1page)->id
+                                    : -1));
                             errors++;
                             continue;
                         }
@@ -179,7 +193,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
                     }
                 }
 
-                adjust(l1page, !shadow_enabled);
+                adjust(l1page, !shadow);
             }
         }
 
@@ -280,7 +294,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
                             errors++;
                         }
 
-                        if ( shadow_enabled &&
+                        if ( shadow_refcounts &&
                              page_is_page_table(gpage) &&
                              ! page_out_of_sync(gpage) )
                         {
@@ -336,19 +350,21 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
                     break;
                 case PGT_l1_shadow:
                     adjust(pfn_to_page(gmfn), 0);
-                    adjust_l1_page(smfn);
+                    if ( shadow_refcounts )
+                        adjust_l1_page(smfn);
                     if ( page->u.inuse.type_info & PGT_pinned )
                         adjust(page, 0);
                     break;
                 case PGT_hl2_shadow:
                     adjust(pfn_to_page(gmfn), 0);
-                    adjust_hl2_page(smfn);
+                    if ( shadow_refcounts )
+                        adjust_hl2_page(smfn);
                     if ( page->u.inuse.type_info & PGT_pinned )
                         adjust(page, 0);
                     break;
                 case PGT_l2_shadow:
                     adjust(pfn_to_page(gmfn), 0);
-                    adjust_l2_page(smfn);
+                    adjust_l2_page(smfn, 1);
                     if ( page->u.inuse.type_info & PGT_pinned )
                         adjust(page, 0);
                     break;
@@ -391,45 +407,43 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
         struct exec_domain *ed;
 
         for_each_exec_domain(d, ed)
-            {
-                if ( !shadow_enabled )
-                {
-                    if ( pagetable_val(ed->arch.guest_table) )
-                        adjust(&frame_table[pagetable_val(ed->arch.guest_table)
-                                            >> PAGE_SHIFT], 1);
-                }
-                else
-                {
-                    if ( pagetable_val(ed->arch.guest_table) )
-                        adjust(&frame_table[pagetable_val(ed->arch.guest_table)
-                                            >> PAGE_SHIFT], 0);
-                    if ( pagetable_val(ed->arch.shadow_table) )
-                        adjust(&frame_table[pagetable_val(ed->arch.shadow_table)
-                                            >> PAGE_SHIFT], 0);
-                    if ( ed->arch.monitor_shadow_ref )
-                        adjust(&frame_table[ed->arch.monitor_shadow_ref], 0);
-                }
-            }
+        {
+            if ( pagetable_val(ed->arch.guest_table) )
+                adjust(&frame_table[pagetable_get_pfn(ed->arch.guest_table)], 1);
+            if ( pagetable_val(ed->arch.shadow_table) )
+                adjust(&frame_table[pagetable_get_pfn(ed->arch.shadow_table)], 0);
+            if ( ed->arch.monitor_shadow_ref )
+                adjust(&frame_table[ed->arch.monitor_shadow_ref], 0);
+        }
     }
 
     void adjust_guest_pages()
     {
         struct list_head *list_ent = d->page_list.next;
         struct pfn_info *page;
-        unsigned long mfn;
+        unsigned long mfn, snapshot_mfn;
 
         while ( list_ent != &d->page_list )
         {
             u32 page_type;
 
             page = list_entry(list_ent, struct pfn_info, list);
-            mfn = page_to_pfn(page);
+            snapshot_mfn = mfn = page_to_pfn(page);
             page_type = page->u.inuse.type_info & PGT_type_mask;
 
             BUG_ON(page_get_owner(page) != d);
 
             page_count++;
 
+            if ( shadow_enabled && !shadow_refcounts &&
+                 page_out_of_sync(page) )
+            {
+                unsigned long gpfn = __mfn_to_gpfn(d, mfn);
+                ASSERT( VALID_M2P(gpfn) );
+                snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
+                ASSERT( snapshot_mfn );
+            }
+
             switch ( page_type )
             {
             case PGT_l2_page_table:
@@ -437,7 +451,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
 
                 if ( noisy )
                 {
-                    if ( shadow_enabled )
+                    if ( shadow_refcounts )
                     {
                         printk("Audit %d: found an L2 guest page "
                                "mfn=%lx t=%08x c=%08x while in shadow mode\n",
@@ -446,19 +460,22 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
                         errors++;
                     }
 
-                    if ( (page->u.inuse.type_info & PGT_validated) !=
-                         PGT_validated )
+                    if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
                     {
-                        printk("Audit %d: L2 mfn=%lx not validated %08x\n",
-                               d->id, mfn, page->u.inuse.type_info);
-                        errors++;
-                    }
+                        if ( (page->u.inuse.type_info & PGT_validated) !=
+                             PGT_validated )
+                        {
+                            printk("Audit %d: L2 mfn=%lx not validated %08x\n",
+                                   d->id, mfn, page->u.inuse.type_info);
+                            errors++;
+                        }
 
-                    if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
-                    {
-                        printk("Audit %d: L2 mfn=%lx not pinned t=%08x\n",
-                               d->id, mfn, page->u.inuse.type_info);
-                        errors++;
+                        if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
+                        {
+                            printk("Audit %d: L2 mfn=%lx not pinned t=%08x\n",
+                                   d->id, mfn, page->u.inuse.type_info);
+                            errors++;
+                        }
                     }
                 }
 
@@ -466,7 +483,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
                     adjust(page, 1);
 
                 if ( page->u.inuse.type_info & PGT_validated )
-                    adjust_l2_page(mfn);
+                    adjust_l2_page(snapshot_mfn, 0);
 
                 break;
 
@@ -475,7 +492,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
 
                 if ( noisy )
                 {
-                    if ( shadow_enabled )
+                    if ( shadow_refcounts )
                     {
                         printk("found an L1 guest page mfn=%lx t=%08x c=%08x "
                                "while in shadow mode\n",
@@ -483,21 +500,24 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
                         errors++;
                     }
 
-                    if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
-                    {
-                        printk("Audit %d: L1 not validated mfn=%lx t=%08x\n",
-                               d->id, mfn, page->u.inuse.type_info);
-                        errors++;
-                    }
-
-                    if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
+                    if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
                     {
-                        if ( !VM_ASSIST(d, VMASST_TYPE_writable_pagetables) )
+                        if ( (page->u.inuse.type_info & PGT_validated) !=
+                             PGT_validated )
                         {
-                            printk("Audit %d: L1 mfn=%lx not pinned t=%08x\n",
+                            printk("Audit %d: L1 not validated mfn=%lx t=%08x\n",
                                    d->id, mfn, page->u.inuse.type_info);
                             errors++;
                         }
+
+                        if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
+                        {
+                            if ( !VM_ASSIST(d, VMASST_TYPE_writable_pagetables) )
+                            {
+                                printk("Audit %d: L1 mfn=%lx not pinned t=%08x\n",
+                                       d->id, mfn, page->u.inuse.type_info);
+                            }
+                        }
                     }
                 }
                 
@@ -505,7 +525,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
                     adjust(page, 1);
 
                 if ( page->u.inuse.type_info & PGT_validated )
-                    adjust_l1_page(mfn);
+                    adjust_l1_page(snapshot_mfn);
 
                 break;
 
@@ -520,7 +540,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
                 break;
 
             case PGT_writable_page:
-                if ( shadow_enabled )
+                if ( shadow_refcounts )
                 {
                     // In shadow mode, writable pages can get pinned by
                     // paravirtualized guests that think they are pinning
@@ -589,6 +609,8 @@ void audit_pagelist(struct domain *d)
 
 void _audit_domain(struct domain *d, int flags)
 {
+    int shadow_refcounts = !!shadow_mode_refcounts(d);
+
     void scan_for_pfn_in_mfn(struct domain *d, unsigned long xmfn,
                              unsigned long mfn)
     {
@@ -608,8 +630,29 @@ void _audit_domain(struct domain *d, int flags)
         unmap_domain_mem(pt);           
     }
 
+    void scan_for_pfn_in_grant_table(struct domain *d, unsigned xmfn)
+    {
+        int i;
+        active_grant_entry_t *act = d->grant_table->active;
+
+        spin_lock(&d->grant_table->lock);
+
+        for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
+        {
+            if ( act[i].pin && (act[i].frame == xmfn) )
+            {
+                printk("     found active grant table entry i=%d dom=%d pin=%d\n",
+                       i, act[i].domid, act[i].pin);
+            }
+        }
+
+        spin_unlock(&d->grant_table->lock);
+    }
+
     void scan_for_pfn(struct domain *d, unsigned long xmfn)
     {
+        scan_for_pfn_in_grant_table(d, xmfn);
+
         if ( !shadow_mode_enabled(d) )
         {
             struct list_head *list_ent = d->page_list.next;
@@ -688,7 +731,7 @@ void _audit_domain(struct domain *d, int flags)
 
     // Maybe we should just be using BIGLOCK?
     //
-    if ( !(flags & AUDIT_ALREADY_LOCKED) )
+    if ( !(flags & AUDIT_SHADOW_ALREADY_LOCKED) )
         shadow_lock(d);
 
     spin_lock(&d->page_alloc_lock);
@@ -716,7 +759,7 @@ void _audit_domain(struct domain *d, int flags)
             errors++;
         }
 
-        if ( shadow_mode_enabled(d) &&
+        if ( shadow_mode_refcounts(d) &&
              (page_type == PGT_writable_page) &&
              !(page->u.inuse.type_info & PGT_validated) )
         {
@@ -764,7 +807,9 @@ void _audit_domain(struct domain *d, int flags)
                        mfn);
                 errors++;
             }
-            if ( page_type != PGT_writable_page )
+            if ( shadow_refcounts
+                 ? (page_type != PGT_writable_page)
+                 : !(page_type && (page_type <= PGT_l4_page_table)) )
             {
                 printk("out of sync page mfn=%lx has strange type "
                        "t=%08x c=%08x\n",
@@ -821,7 +866,7 @@ void _audit_domain(struct domain *d, int flags)
                        d->id, page->u.inuse.type_info, 
                        page->tlbflush_timestamp,
                        page->count_info, mfn);
-                errors++;
+                //errors++;
             }
             break;
         default:
@@ -835,7 +880,7 @@ void _audit_domain(struct domain *d, int flags)
                    page->count_info,
                    page->u.inuse.type_info, 
                    page->tlbflush_timestamp, mfn );
-            errors++;
+            //errors++;
             scan_for_pfn_remote(mfn);
         }
 
@@ -870,6 +915,8 @@ void _audit_domain(struct domain *d, int flags)
                                d->id, page_to_pfn(page),
                                page->u.inuse.type_info,
                                page->count_info);
+                        printk("a->gpfn_and_flags=%p\n",
+                               (void *)a->gpfn_and_flags);
                         errors++;
                     }
                     break;
@@ -905,7 +952,7 @@ void _audit_domain(struct domain *d, int flags)
                "pages=%d oos=%d l1=%d l2=%d ctot=%d ttot=%d\n",
                d->id, page_count, oos_count, l1, l2, ctot, ttot);
 
-    if ( !(flags & AUDIT_ALREADY_LOCKED) )
+    if ( !(flags & AUDIT_SHADOW_ALREADY_LOCKED) )
         shadow_unlock(d);
 
     if ( d != current->domain )
index a5029a45d5fe684b98c69e4c338c667a30742929..2003ecc6ed7271a25eb66723f9fd1f9350f9085b 100644 (file)
@@ -359,7 +359,8 @@ static int vmx_final_setup_guest(struct exec_domain *ed,
 
         /* Put the domain in shadow mode even though we're going to be using
          * the shared 1:1 page table initially. It shouldn't hurt */
-        shadow_mode_enable(ed->domain, SHM_enable|SHM_translate|SHM_external);
+        shadow_mode_enable(ed->domain,
+                           SHM_enable|SHM_refcounts|SHM_translate|SHM_external);
     }
 
     return 0;
@@ -450,7 +451,7 @@ int arch_set_info_guest(
     phys_basetab = c->pt_base;
     ed->arch.guest_table = mk_pagetable(phys_basetab);
 
-    if ( shadow_mode_enabled(d) )
+    if ( shadow_mode_refcounts(d) )
     {
         if ( !get_page(&frame_table[phys_basetab>>PAGE_SHIFT], d) )
             return -EINVAL;
@@ -991,17 +992,21 @@ void domain_relinquish_resources(struct domain *d)
     {
         if ( pagetable_val(ed->arch.guest_table) != 0 )
         {
-            (shadow_mode_enabled(d) ? put_page : put_page_and_type)
-                (&frame_table[pagetable_val(
-                    ed->arch.guest_table) >> PAGE_SHIFT]);
+            if ( shadow_mode_refcounts(d) )
+                put_page(&frame_table[pagetable_get_pfn(ed->arch.guest_table)]);
+            else
+                put_page_and_type(&frame_table[pagetable_get_pfn(ed->arch.guest_table)]);
+
             ed->arch.guest_table = mk_pagetable(0);
         }
 
         if ( pagetable_val(ed->arch.guest_table_user) != 0 )
         {
-            (shadow_mode_enabled(d) ? put_page : put_page_and_type)
-                (&frame_table[pagetable_val(
-                    ed->arch.guest_table_user) >> PAGE_SHIFT]);
+            if ( shadow_mode_refcounts(d) )
+                put_page(&frame_table[pagetable_get_pfn(ed->arch.guest_table_user)]);
+            else
+                put_page_and_type(&frame_table[pagetable_get_pfn(ed->arch.guest_table_user)]);
+
             ed->arch.guest_table_user = mk_pagetable(0);
         }
 
index f1488d6f081fa7638fe1e8c9fb92d76900122639..570fd1d33c9eb603feff7148c8f76891d097e89d 100644 (file)
@@ -546,7 +546,7 @@ int construct_dom0(struct domain *d,
     if ( opt_dom0_shadow || opt_dom0_translate )
     {
         shadow_mode_enable(d, (opt_dom0_translate
-                               ? SHM_enable | SHM_translate
+                               ? SHM_enable | SHM_refcounts | SHM_translate
                                : SHM_enable));
         if ( opt_dom0_translate )
         {
@@ -569,7 +569,7 @@ int construct_dom0(struct domain *d,
             idle_pg_table[1] = root_create_phys(pagetable_val(d->arch.phys_table),
                                                 __PAGE_HYPERVISOR);
             translate_l2pgtable(d, (l1_pgentry_t *)(1u << L2_PAGETABLE_SHIFT),
-                                pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT);
+                                pagetable_get_pfn(ed->arch.guest_table));
             idle_pg_table[1] = root_empty();
             local_flush_tlb();
         }
index 050a4c0d33516258b99a77df6c864bf7cf4355f2..107d2af5854e1ada0b517d06420bf3f987f79461 100644 (file)
@@ -316,7 +316,7 @@ int map_ldt_shadow_page(unsigned int off)
 
     res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page);
 
-    if ( !res && unlikely(shadow_mode_enabled(d)) )
+    if ( !res && unlikely(shadow_mode_refcounts(d)) )
     {
         shadow_lock(d);
         shadow_remove_all_write_access(d, gpfn, gmfn);
@@ -392,7 +392,7 @@ get_linear_pagetable(
     struct pfn_info *page;
     unsigned long pfn;
 
-    ASSERT( !shadow_mode_enabled(d) );
+    ASSERT( !shadow_mode_refcounts(d) );
 
     if ( (root_get_flags(re) & _PAGE_RW) )
     {
@@ -482,7 +482,7 @@ get_page_from_l2e(
 {
     int rc;
 
-    ASSERT(!shadow_mode_enabled(d));
+    ASSERT(!shadow_mode_refcounts(d));
 
     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
         return 1;
@@ -512,6 +512,8 @@ static int
 get_page_from_l3e(
     l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
 {
+    ASSERT( !shadow_mode_refcounts(d) );
+
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
         return 1;
 
@@ -533,6 +535,8 @@ get_page_from_l4e(
 {
     int rc;
 
+    ASSERT( !shadow_mode_refcounts(d) );
+
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
         return 1;
 
@@ -641,7 +645,7 @@ static int alloc_l1_table(struct pfn_info *page)
     l1_pgentry_t  *pl1e;
     int            i;
 
-    ASSERT(!shadow_mode_enabled(d));
+    ASSERT(!shadow_mode_refcounts(d));
 
     pl1e = map_domain_mem(pfn << PAGE_SHIFT);
 
@@ -670,10 +674,12 @@ static int alloc_l2_table(struct pfn_info *page)
     l2_pgentry_t  *pl2e;
     int            i;
 
+    // See the code in shadow_promote() to understand why this is here...
     if ( (PGT_base_page_table == PGT_l2_page_table) &&
-         shadow_mode_enabled(d) )
+         unlikely(shadow_mode_refcounts(d)) )
         return 1;
-    ASSERT( !shadow_mode_enabled(d) );
+
+    ASSERT( !shadow_mode_refcounts(d) );
    
     pl2e = map_domain_mem(pfn << PAGE_SHIFT);
 
@@ -716,7 +722,7 @@ static int alloc_l3_table(struct pfn_info *page)
     l3_pgentry_t  *pl3e = page_to_virt(page);
     int            i;
 
-    ASSERT( !shadow_mode_enabled(d) );
+    ASSERT( !shadow_mode_refcounts(d) );
 
     for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
         if ( is_guest_l3_slot(i) &&
@@ -741,10 +747,12 @@ static int alloc_l4_table(struct pfn_info *page)
     l4_pgentry_t  *pl4e = page_to_virt(page);
     int            i;
 
+    // See the code in shadow_promote() to understand why this is here...
     if ( (PGT_base_page_table == PGT_l4_page_table) &&
-         shadow_mode_enabled(d) )
+         shadow_mode_refcounts(d) )
         return 1;
-    ASSERT( !shadow_mode_enabled(d) );
+
+    ASSERT( !shadow_mode_refcounts(d) );
 
     for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
         if ( is_guest_l4_slot(i) &&
@@ -861,11 +869,12 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
     l1_pgentry_t ol1e;
     struct domain *d = current->domain;
 
-    ASSERT( !shadow_mode_enabled(d) );
-
     if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
         return 0;
 
+    if ( unlikely(shadow_mode_refcounts(d)) )
+        return update_l1e(pl1e, ol1e, nl1e);
+
     if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
     {
         if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
@@ -893,7 +902,7 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
         if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
             return 0;
     }
-    
+
     put_page_from_l1e(ol1e, d);
     return 1;
 }
@@ -1095,8 +1104,19 @@ int alloc_page_type(struct pfn_info *page, unsigned int type)
 void free_page_type(struct pfn_info *page, unsigned int type)
 {
     struct domain *owner = page_get_owner(page);
-    if ( likely(owner != NULL) && unlikely(shadow_mode_enabled(owner)) )
-        return;
+    unsigned long gpfn;
+
+    if ( owner != NULL )
+    {
+        if ( unlikely(shadow_mode_refcounts(owner)) )
+            return;
+        if ( unlikely(shadow_mode_enabled(owner)) )
+        {
+            gpfn = __mfn_to_gpfn(owner, page_to_pfn(page));
+            ASSERT(VALID_M2P(gpfn));
+            remove_shadow(owner, gpfn, type);
+        }
+    }
 
     switch ( type )
     {
@@ -1287,7 +1307,7 @@ int new_guest_cr3(unsigned long mfn)
     int okay;
     unsigned long old_base_mfn;
 
-    if ( shadow_mode_enabled(d) )
+    if ( shadow_mode_refcounts(d) )
         okay = get_page_from_pagenr(mfn, d);
     else
         okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
@@ -1296,24 +1316,24 @@ int new_guest_cr3(unsigned long mfn)
     {
         invalidate_shadow_ldt(ed);
 
-        old_base_mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
+        old_base_mfn = pagetable_get_pfn(ed->arch.guest_table);
         ed->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
         update_pagetables(ed); /* update shadow_table and monitor_table */
 
         write_ptbase(ed);
 
-        if ( shadow_mode_enabled(d) )
+        if ( shadow_mode_refcounts(d) )
             put_page(&frame_table[old_base_mfn]);
         else
             put_page_and_type(&frame_table[old_base_mfn]);
 
-        /* CR3 holds its own ref to its shadow. */
+        /* CR3 also holds a ref to its shadow... */
         if ( shadow_mode_enabled(d) )
         {
             if ( ed->arch.monitor_shadow_ref )
                 put_shadow_ref(ed->arch.monitor_shadow_ref);
             ed->arch.monitor_shadow_ref =
-                pagetable_val(ed->arch.monitor_table) >> PAGE_SHIFT;
+                pagetable_get_pfn(ed->arch.monitor_table);
             ASSERT(!page_get_owner(&frame_table[ed->arch.monitor_shadow_ref]));
             get_shadow_ref(ed->arch.monitor_shadow_ref);
         }
@@ -1486,7 +1506,7 @@ int do_mmuext_op(
             type = PGT_l1_page_table | PGT_va_mutable;
 
         pin_page:
-            if ( shadow_mode_enabled(FOREIGNDOM) )
+            if ( shadow_mode_refcounts(FOREIGNDOM) )
                 type = PGT_writable_page;
 
             okay = get_page_and_type_from_pagenr(op.mfn, type, FOREIGNDOM);
@@ -1557,7 +1577,7 @@ int do_mmuext_op(
             else
             {
                 unsigned long old_mfn =
-                    pagetable_val(ed->arch.guest_table_user) >> PAGE_SHIFT;
+                    pagetable_get_pfn(ed->arch.guest_table_user);
                 ed->arch.guest_table_user = mk_pagetable(op.mfn << PAGE_SHIFT);
                 if ( old_mfn != 0 )
                     put_page_and_type(&frame_table[old_mfn]);
@@ -1785,13 +1805,16 @@ int do_mmu_update(
     unsigned int foreigndom)
 {
     mmu_update_t req;
-    unsigned long va = 0, mfn, prev_mfn = 0, gpfn;
+    void *va;
+    unsigned long gpfn, mfn;
     struct pfn_info *page;
     int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
     unsigned int cmd, done = 0;
     struct exec_domain *ed = current;
     struct domain *d = ed->domain;
     u32 type_info;
+    struct map_dom_mem_cache mapcache = MAP_DOM_MEM_CACHE_INIT;
+    struct map_dom_mem_cache sh_mapcache = MAP_DOM_MEM_CACHE_INIT;
 
     LOCK_BIGLOCK(d);
 
@@ -1841,8 +1864,6 @@ int do_mmu_update(
         }
 
         cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
-        mfn = req.ptr >> PAGE_SHIFT;
-
         okay = 0;
 
         switch ( cmd )
@@ -1851,73 +1872,75 @@ int do_mmu_update(
              * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
              */
         case MMU_NORMAL_PT_UPDATE:
+
+            gpfn = req.ptr >> PAGE_SHIFT;
+            mfn = __gpfn_to_mfn(d, gpfn);
+
             if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
             {
                 MEM_LOG("Could not get page for normal update");
                 break;
             }
 
-            if ( likely(prev_mfn == mfn) )
-            {
-                va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
-            }
-            else
-            {
-                if ( prev_mfn != 0 )
-                    unmap_domain_mem((void *)va);
-                va = (unsigned long)map_domain_mem(req.ptr);
-                prev_mfn = mfn;
-            }
-
+            va = map_domain_mem_with_cache(req.ptr, &mapcache);
             page = &frame_table[mfn];
+
             switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
             {
             case PGT_l1_page_table: 
-                ASSERT(!shadow_mode_enabled(d));
+                ASSERT( !shadow_mode_refcounts(d) );
                 if ( likely(get_page_type(
                     page, type_info & (PGT_type_mask|PGT_va_mask))) )
                 {
-                    l1_pgentry_t pte;
+                    l1_pgentry_t l1e;
 
                     /* FIXME: doesn't work with PAE */
-                    pte = l1e_create_phys(req.val, req.val);
-                    okay = mod_l1_entry((l1_pgentry_t *)va, pte);
+                    l1e = l1e_create_phys(req.val, req.val);
+                    okay = mod_l1_entry(va, l1e);
+                    if ( okay && unlikely(shadow_mode_enabled(d)) )
+                        shadow_l1_normal_pt_update(d, req.ptr, l1e, &sh_mapcache);
                     put_page_type(page);
                 }
                 break;
             case PGT_l2_page_table:
-                ASSERT(!shadow_mode_enabled(d));
+                ASSERT( !shadow_mode_refcounts(d) );
                 if ( likely(get_page_type(page, PGT_l2_page_table)) )
                 {
                     l2_pgentry_t l2e;
 
                     /* FIXME: doesn't work with PAE */
                     l2e = l2e_create_phys(req.val, req.val);
-                    okay = mod_l2_entry((l2_pgentry_t *)va, l2e, mfn);
+                    okay = mod_l2_entry(va, l2e, mfn);
+                    if ( okay && unlikely(shadow_mode_enabled(d)) )
+                        shadow_l2_normal_pt_update(d, req.ptr, l2e, &sh_mapcache);
                     put_page_type(page);
                 }
                 break;
 #ifdef __x86_64__
             case PGT_l3_page_table:
-                ASSERT(!shadow_mode_enabled(d));
+                ASSERT( !shadow_mode_refcounts(d) );
                 if ( likely(get_page_type(page, PGT_l3_page_table)) )
                 {
                     l3_pgentry_t l3e;
 
                     /* FIXME: doesn't work with PAE */
                     l3e = l3e_create_phys(req.val,req.val);
-                    okay = mod_l3_entry((l3_pgentry_t *)va, l3e, mfn);
+                    okay = mod_l3_entry(va, l3e, mfn);
+                    if ( okay && unlikely(shadow_mode_enabled(d)) )
+                        shadow_l3_normal_pt_update(d, req.ptr, l3e, &sh_mapcache);
                     put_page_type(page);
                 }
                 break;
             case PGT_l4_page_table:
-                ASSERT(!shadow_mode_enabled(d));
+                ASSERT( !shadow_mode_refcounts(d) );
                 if ( likely(get_page_type(page, PGT_l4_page_table)) )
                 {
                     l4_pgentry_t l4e;
 
                     l4e = l4e_create_phys(req.val,req.val);
-                    okay = mod_l4_entry((l4_pgentry_t *)va, l4e, mfn);
+                    okay = mod_l4_entry(va, l4e, mfn);
+                    if ( okay && unlikely(shadow_mode_enabled(d)) )
+                        shadow_l4_normal_pt_update(d, req.ptr, l4e, &sh_mapcache);
                     put_page_type(page);
                 }
                 break;
@@ -1932,9 +1955,6 @@ int do_mmu_update(
                         if ( shadow_mode_log_dirty(d) )
                             __mark_dirty(d, mfn);
 
-                        gpfn = __mfn_to_gpfn(d, mfn);
-                        ASSERT(VALID_M2P(gpfn));
-
                         if ( page_is_page_table(page) &&
                              !page_out_of_sync(page) )
                         {
@@ -1953,24 +1973,29 @@ int do_mmu_update(
                 break;
             }
 
+            unmap_domain_mem_with_cache(va, &mapcache);
+
             put_page(page);
             break;
 
         case MMU_MACHPHYS_UPDATE:
 
+            mfn = req.ptr >> PAGE_SHIFT;
+            gpfn = req.val;
+
             /* HACK ALERT...  Need to think about this some more... */
             if ( unlikely(shadow_mode_translate(FOREIGNDOM) && IS_PRIV(d)) )
             {
-                rc = FOREIGNDOM->next_io_page++;
-                printk("privileged guest dom%d requests mfn=%lx for dom%d, "
-                       "gets pfn=%x\n",
-                       d->id, mfn, FOREIGNDOM->id, rc);
-                set_machinetophys(mfn, rc);
-                set_p2m_entry(FOREIGNDOM, rc, mfn);
+                shadow_lock(FOREIGNDOM);
+                printk("privileged guest dom%d requests pfn=%lx to map mfn=%lx for dom%d\n",
+                       d->id, gpfn, mfn, FOREIGNDOM->id);
+                set_machinetophys(mfn, gpfn);
+                set_p2m_entry(FOREIGNDOM, gpfn, mfn, NULL, NULL);
                 okay = 1;
+                shadow_unlock(FOREIGNDOM);
                 break;
             }
-            
+
             if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
             {
                 MEM_LOG("Could not get page for mach->phys update");
@@ -1983,7 +2008,7 @@ int do_mmu_update(
                 break;
             }
 
-            set_machinetophys(mfn, req.val);
+            set_machinetophys(mfn, gpfn);
             okay = 1;
 
             /*
@@ -2012,8 +2037,8 @@ int do_mmu_update(
     }
 
  out:
-    if ( prev_mfn != 0 )
-        unmap_domain_mem((void *)va);
+    unmap_domain_mem_cache(&mapcache);
+    unmap_domain_mem_cache(&sh_mapcache);
 
     process_deferred_ops(cpu);
 
@@ -2031,73 +2056,6 @@ int do_mmu_update(
 /* This function assumes the caller is holding the domain's BIGLOCK
  * and is running in a shadow mode
  */
-int update_shadow_va_mapping(unsigned long va,
-                             l1_pgentry_t val,
-                             struct exec_domain *ed,
-                             struct domain *d)
-{
-    unsigned long l1mfn;
-    l1_pgentry_t spte;
-    int rc = 0;
-
-    check_pagetable(ed, "pre-va"); /* debug */
-    shadow_lock(d);
-        
-    // This is actually overkill - we don't need to sync the L1 itself,
-    // just everything involved in getting to this L1 (i.e. we need
-    // linear_pg_table[l1_linear_offset(va)] to be in sync)...
-    //
-    __shadow_sync_va(ed, va);
-
-#if 1 /* keep check_pagetables() happy */
-    /*
-     * However, the above doesn't guarantee that there's no snapshot of
-     * the L1 table in question; it just says that the relevant L2 and L1
-     * entries for VA are in-sync.  There might still be a snapshot.
-     *
-     * The checking code in _check_pagetables() assumes that no one will
-     * mutate the shadow of a page that has a snapshot.  It's actually
-     * OK to not sync this page, but it seems simpler to:
-     * 1) keep all code paths the same, and
-     * 2) maintain the invariant for _check_pagetables(), rather than try
-     *    to teach it about this boundary case.
-     * So we flush this L1 page, if it's out of sync.
-     */
-    l1mfn = l2e_get_pfn(linear_l2_table(ed)[l2_table_offset(va)]);
-    if ( mfn_out_of_sync(l1mfn) )
-    {
-        perfc_incrc(extra_va_update_sync);
-        __shadow_sync_mfn(d, l1mfn);
-    }
-#endif /* keep check_pagetables() happy */
-
-    if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
-                                 &val, sizeof(val))))
-    {
-        rc = -EINVAL;
-        goto out;
-    }
-
-    // also need to update the shadow
-
-    l1pte_propagate_from_guest(d, val, &spte);
-    shadow_set_l1e(va, spte, 0);
-
-    /*
-     * If we're in log-dirty mode then we need to note that we've updated
-     * the PTE in the PT-holding page. We need the machine frame number
-     * for this.
-     */
-    if ( shadow_mode_log_dirty(d) )
-        mark_dirty(d, va_to_l1mfn(ed, va));
-
- out:
-    shadow_unlock(d);
-    check_pagetable(ed, "post-va"); /* debug */
-
-    return rc;
-}
-
 int update_grant_va_mapping(unsigned long va,
                             l1_pgentry_t _nl1e, 
                             struct domain *d,
@@ -2116,11 +2074,17 @@ int update_grant_va_mapping(unsigned long va,
     
     cleanup_writable_pagetable(d);
 
+    // This is actually overkill - we don't need to sync the L1 itself,
+    // just everything involved in getting to this L1 (i.e. we need
+    // linear_pg_table[l1_linear_offset(va)] to be in sync)...
+    //
+    __shadow_sync_va(ed, va);
+
     pl1e = &linear_pg_table[l1_linear_offset(va)];
 
     if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
         rc = -EINVAL;
-    else
+    else if ( !shadow_mode_refcounts(d) )
     {
         if ( update_l1e(pl1e, ol1e, _nl1e) )
         {
@@ -2133,9 +2097,14 @@ int update_grant_va_mapping(unsigned long va,
         else
             rc = -EINVAL;
     }
+    else
+    {
+        printk("grant tables and shadow mode currently don't work together\n");
+        BUG();
+    }
 
     if ( unlikely(shadow_mode_enabled(d)) )
-        update_shadow_va_mapping(va, _nl1e, ed, d);
+        shadow_do_update_va_mapping(va, _nl1e, ed);
 
     return rc;
 }
@@ -2161,6 +2130,13 @@ int do_update_va_mapping(unsigned long va,
     cleanup_writable_pagetable(d);
 
     if ( unlikely(shadow_mode_enabled(d)) )
+        check_pagetable(ed, "pre-va"); /* debug */
+
+    if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
+                                val)) )
+        rc = -EINVAL;
+
+    if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
     {
         if ( unlikely(percpu_info[cpu].foreign &&
                       (shadow_mode_translate(d) ||
@@ -2173,11 +2149,10 @@ int do_update_va_mapping(unsigned long va,
             domain_crash();
         }
     
-        rc = update_shadow_va_mapping(va, val, ed, d);
+        rc = shadow_do_update_va_mapping(va, val, ed);
+
+        check_pagetable(ed, "post-va"); /* debug */
     }
-    else if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
-                                     val)) )
-        rc = -EINVAL;
 
     switch ( flags & UVMF_FLUSHTYPE_MASK )
     {
@@ -2468,14 +2443,68 @@ int ptwr_debug = 0x0;
 #define PTWR_PRINTK(_f, _a...) ((void)0)
 #endif
 
+/* Re-validate a given p.t. page, given its prior snapshot */
+int revalidate_l1(struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
+{
+    l1_pgentry_t ol1e, nl1e;
+    int modified = 0, i;
+
+#if 0
+    if ( d->id )
+        printk("%s: l1page mfn=%lx snapshot mfn=%lx\n", __func__,
+               l1e_get_pfn(linear_pg_table[l1_linear_offset((unsigned long)l1page)]),
+               l1e_get_pfn(linear_pg_table[l1_linear_offset((unsigned long)snapshot)]));
+#endif
+
+    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+    {
+        ol1e = snapshot[i];
+        nl1e = l1page[i];
+
+        if ( likely(l1e_get_value(ol1e) == l1e_get_value(nl1e)) )
+            continue;
+
+        /* Update number of entries modified. */
+        modified++;
+
+        /*
+         * Fast path for PTEs that have merely been write-protected
+         * (e.g., during a Unix fork()). A strict reduction in privilege.
+         */
+        if ( likely(l1e_get_value(ol1e) == (l1e_get_value(nl1e)|_PAGE_RW)) )
+        {
+            if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
+                put_page_type(&frame_table[l1e_get_pfn(nl1e)]);
+            continue;
+        }
+
+        if ( unlikely(!get_page_from_l1e(nl1e, d)) )
+        {
+            MEM_LOG("ptwr: Could not re-validate l1 page\n");
+            /*
+             * Make the remaining p.t's consistent before crashing, so the
+             * reference counts are correct.
+             */
+            memcpy(&l1page[i], &snapshot[i],
+                   (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
+            domain_crash();
+            break;
+        }
+        
+        put_page_from_l1e(ol1e, d);
+    }
+
+    return modified;
+}
+
+
 /* Flush the given writable p.t. page and write-protect it again. */
 void ptwr_flush(struct domain *d, const int which)
 {
     unsigned long  pte, *ptep, l1va;
-    l1_pgentry_t  *pl1e, ol1e, nl1e;
+    l1_pgentry_t  *pl1e;
     l2_pgentry_t  *pl2e;
-    int            i;
-    unsigned int   modified = 0;
+    unsigned int   modified;
 
     ASSERT(!shadow_mode_enabled(d));
 
@@ -2524,45 +2553,8 @@ void ptwr_flush(struct domain *d, const int which)
      */
 
     pl1e = d->arch.ptwr[which].pl1e;
-    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-    {
-        ol1e = d->arch.ptwr[which].page[i];
-        nl1e = pl1e[i];
-
-        if ( likely(l1e_get_value(ol1e) == l1e_get_value(nl1e)) )
-            continue;
-
-        /* Update number of entries modified. */
-        modified++;
-
-        /*
-         * Fast path for PTEs that have merely been write-protected
-         * (e.g., during a Unix fork()). A strict reduction in privilege.
-         */
-        if ( likely(l1e_get_value(ol1e) == (l1e_get_value(nl1e)|_PAGE_RW)) )
-        {
-            if ( likely(l1e_get_flags(nl1e) & _PAGE_PRESENT) )
-                put_page_type(&frame_table[l1e_get_pfn(nl1e)]);
-            continue;
-        }
-
-        if ( unlikely(!get_page_from_l1e(nl1e, d)) )
-        {
-            MEM_LOG("ptwr: Could not re-validate l1 page\n");
-            /*
-             * Make the remaining p.t's consistent before crashing, so the
-             * reference counts are correct.
-             */
-            memcpy(&pl1e[i], &d->arch.ptwr[which].page[i],
-                   (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
-            domain_crash();
-            break;
-        }
-        
-        put_page_from_l1e(ol1e, d);
-    }
+    modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page);
     unmap_domain_mem(pl1e);
-    
     perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
     d->arch.ptwr[which].prev_nr_updates  = modified;
 
index 1db8ed71e9b57580e52a2f70eb85317d4b9c3860..817757c36ae0fcf63ef6cb48b235ffe8918950fe 100644 (file)
 #include <xen/sched.h>
 #include <xen/trace.h>
 
+#define MFN_PINNED(_x) (frame_table[_x].u.inuse.type_info & PGT_pinned)
+
 static void shadow_free_snapshot(struct domain *d,
                                  struct out_of_sync_entry *entry);
 static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn);
 static void free_writable_pte_predictions(struct domain *d);
 
+#if SHADOW_DEBUG
+static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
+#endif
+
 /********
 
 There's a per-domain shadow table spin lock which works fine for SMP
@@ -62,6 +68,9 @@ shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
         __shadow_sync_mfn(d, gmfn);
     }
 
+    if ( !shadow_mode_refcounts(d) )
+        return 1;
+
     if ( unlikely(page_is_page_table(page)) )
         return 1;
 
@@ -89,7 +98,7 @@ shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
     // TLB flushes required when promoting a writable page, and also deal
     // with any outstanding (external) writable refs to this page (by
     // refusing to promote it).  The pinning headache complicates this
-    // code -- it would all much get simpler if we stop using
+    // code -- it would all get much simpler if we stop using
     // shadow_lock() and move the shadow code to BIGLOCK().
     //
     if ( unlikely(!get_page(page, d)) )
@@ -130,6 +139,9 @@ shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
 static inline void
 shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
 {
+    if ( !shadow_mode_refcounts(d) )
+        return;
+
     ASSERT(frame_table[gmfn].count_info & PGC_page_table);
 
     if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
@@ -210,7 +222,7 @@ alloc_shadow_page(struct domain *d,
         else
         {
             page = alloc_domheap_page(NULL);
-            void *l1 = map_domain_mem(page_to_pfn(page) << PAGE_SHIFT);
+            void *l1 = map_domain_mem(page_to_phys(page));
             memset(l1, 0, PAGE_SIZE);
             unmap_domain_mem(l1);
         }
@@ -312,7 +324,7 @@ free_shadow_l1_table(struct domain *d, unsigned long smfn)
 
     for ( i = min; i <= max; i++ )
     {
-        put_page_from_l1e(pl1e[i], d);
+        shadow_put_page_from_l1e(pl1e[i], d);
         pl1e[i] = l1e_empty();
     }
 
@@ -348,21 +360,20 @@ free_shadow_hl2_table(struct domain *d, unsigned long smfn)
 static void inline
 free_shadow_l2_table(struct domain *d, unsigned long smfn)
 {
-    unsigned long *pl2e = map_domain_mem(smfn << PAGE_SHIFT);
+    l2_pgentry_t *pl2e = map_domain_mem(smfn << PAGE_SHIFT);
     int i, external = shadow_mode_external(d);
 
     for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
         if ( external || is_guest_l2_slot(i) )
-            if ( pl2e[i] & _PAGE_PRESENT )
-                put_shadow_ref(pl2e[i] >> PAGE_SHIFT);
+            if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT )
+                put_shadow_ref(l2e_get_pfn(pl2e[i]));
 
     if ( (PGT_base_page_table == PGT_l2_page_table) &&
          shadow_mode_translate(d) && !external )
     {
         // free the ref to the hl2
         //
-        put_shadow_ref(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]
-                       >> PAGE_SHIFT);
+        put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]));
     }
 
     unmap_domain_mem(pl2e);
@@ -428,6 +439,26 @@ void free_shadow_page(unsigned long smfn)
         free_domheap_page(page);
 }
 
+void
+remove_shadow(struct domain *d, unsigned long gpfn, u32 stype)
+{
+    unsigned long smfn;
+
+    //printk("%s(gpfn=%lx, type=%x)\n", __func__, gpfn, stype);
+
+    shadow_lock(d);
+
+    while ( stype >= PGT_l1_shadow )
+    {
+        smfn = __shadow_status(d, gpfn, stype);
+        if ( smfn && MFN_PINNED(smfn) )
+            shadow_unpin(smfn);
+        stype -= PGT_l1_shadow;
+    }
+
+    shadow_unlock(d);
+}
+
 static void inline
 release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
 {
@@ -537,15 +568,22 @@ static void free_shadow_pages(struct domain *d)
     //
     free_out_of_sync_state(d);
 
-    // second, remove any outstanding refs from ed->arch.shadow_table...
+    // second, remove any outstanding refs from ed->arch.shadow_table
+    // and CR3.
     //
     for_each_exec_domain(d, ed)
     {
         if ( pagetable_val(ed->arch.shadow_table) )
         {
-            put_shadow_ref(pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT);
+            put_shadow_ref(pagetable_get_pfn(ed->arch.shadow_table));
             ed->arch.shadow_table = mk_pagetable(0);
         }
+
+        if ( ed->arch.monitor_shadow_ref )
+        {
+            put_shadow_ref(ed->arch.monitor_shadow_ref);
+            ed->arch.monitor_shadow_ref = 0;
+        }
     }
 
     // For external shadows, remove the monitor table's refs
@@ -584,7 +622,6 @@ static void free_shadow_pages(struct domain *d)
     // under us...  First, collect the list of pinned pages, then
     // free them.
     //
-#define PINNED(_x) (frame_table[_x].u.inuse.type_info & PGT_pinned)
     for ( i = 0; i < shadow_ht_buckets; i++ )
     {
         u32 count;
@@ -596,7 +633,7 @@ static void free_shadow_pages(struct domain *d)
 
         count = 0;
         for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
-            if ( PINNED(x->smfn) )
+            if ( MFN_PINNED(x->smfn) )
                 count++;
         if ( !count )
             continue;
@@ -604,7 +641,7 @@ static void free_shadow_pages(struct domain *d)
         mfn_list = xmalloc_array(unsigned long, count);
         count = 0;
         for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
-            if ( PINNED(x->smfn) )
+            if ( MFN_PINNED(x->smfn) )
                 mfn_list[count++] = x->smfn;
 
         while ( count )
@@ -613,7 +650,18 @@ static void free_shadow_pages(struct domain *d)
         }
         xfree(mfn_list);
     }
-#undef PINNED
+
+    // Now free the pre-zero'ed pages from the domain
+    //
+    struct list_head *list_ent, *tmp;
+    list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames)
+    {
+        list_del(list_ent);
+        perfc_decr(free_l1_pages);
+
+        struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
+        free_domheap_page(page);
+    }
 
     shadow_audit(d, 0);
 
@@ -624,9 +672,9 @@ void shadow_mode_init(void)
 {
 }
 
-int _shadow_mode_enabled(struct domain *d)
+int _shadow_mode_refcounts(struct domain *d)
 {
-    return shadow_mode_enabled(d);
+    return shadow_mode_refcounts(d);
 }
 
 static void alloc_monitor_pagetable(struct exec_domain *ed)
@@ -706,7 +754,7 @@ void free_monitor_pagetable(struct exec_domain *ed)
     /*
      * Then free monitor_table.
      */
-    mfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT;
+    mfn = pagetable_get_pfn(ed->arch.monitor_table);
     free_domheap_page(&frame_table[mfn]);
 
     ed->arch.monitor_table = mk_pagetable(0);
@@ -714,7 +762,9 @@ void free_monitor_pagetable(struct exec_domain *ed)
 }
 
 int
-set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn)
+set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn,
+              struct map_dom_mem_cache *l2cache,
+              struct map_dom_mem_cache *l1cache)
 {
     unsigned long phystab = pagetable_val(d->arch.phys_table);
     l2_pgentry_t *l2, l2e;
@@ -724,26 +774,29 @@ set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn)
 
     ASSERT( phystab );
 
-    l2 = map_domain_mem(phystab);
+    l2 = map_domain_mem_with_cache(phystab, l2cache);
     l2e = l2[l2_table_offset(va)];
-    if ( !l2e_get_value(l2e) ) /* FIXME: check present bit? */
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
     {
         l1page = alloc_domheap_page(NULL);
         if ( !l1page )
+        {
+            unmap_domain_mem_with_cache(l2, l2cache);
             return 0;
+        }
 
-        l1 = map_domain_mem(page_to_pfn(l1page) << PAGE_SHIFT);
+        l1 = map_domain_mem_with_cache(page_to_phys(l1page), l1cache);
         memset(l1, 0, PAGE_SIZE);
-        unmap_domain_mem(l1);
+        unmap_domain_mem_with_cache(l1, l1cache);
 
         l2e = l2e_create_pfn(page_to_pfn(l1page), __PAGE_HYPERVISOR);
         l2[l2_table_offset(va)] = l2e;
     }
-    unmap_domain_mem(l2);
+    unmap_domain_mem_with_cache(l2, l2cache);
 
-    l1 = map_domain_mem(l2e_get_phys(l2e));
+    l1 = map_domain_mem_with_cache(l2e_get_phys(l2e), l1cache);
     l1[l1_table_offset(va)] = l1e_create_pfn(mfn, __PAGE_HYPERVISOR);
-    unmap_domain_mem(l1);
+    unmap_domain_mem_with_cache(l1, l1cache);
 
     return 1;
 }
@@ -755,14 +808,16 @@ alloc_p2m_table(struct domain *d)
     struct pfn_info *page, *l2page;
     l2_pgentry_t *l2;
     unsigned long mfn, pfn;
+    struct map_dom_mem_cache l2cache = MAP_DOM_MEM_CACHE_INIT;
+    struct map_dom_mem_cache l1cache = MAP_DOM_MEM_CACHE_INIT;
 
     l2page = alloc_domheap_page(NULL);
     if ( !l2page )
         return 0;
-    d->arch.phys_table = mk_pagetable(page_to_pfn(l2page) << PAGE_SHIFT);
-    l2 = map_domain_mem(page_to_pfn(l2page) << PAGE_SHIFT);
+    d->arch.phys_table = mk_pagetable(page_to_phys(l2page));
+    l2 = map_domain_mem_with_cache(page_to_phys(l2page), &l2cache);
     memset(l2, 0, PAGE_SIZE);
-    unmap_domain_mem(l2);
+    unmap_domain_mem_with_cache(l2, &l2cache);
 
     list_ent = d->page_list.next;
     while ( list_ent != &d->page_list )
@@ -773,7 +828,7 @@ alloc_p2m_table(struct domain *d)
         ASSERT(pfn != INVALID_M2P_ENTRY);
         ASSERT(pfn < (1u<<20));
 
-        set_p2m_entry(d, pfn, mfn);
+        set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache);
 
         list_ent = page->list.next;
     }
@@ -787,12 +842,15 @@ alloc_p2m_table(struct domain *d)
         if ( (pfn != INVALID_M2P_ENTRY) &&
              (pfn < (1u<<20)) )
         {
-            set_p2m_entry(d, pfn, mfn);
+            set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache);
         }
 
         list_ent = page->list.next;
     }
 
+    unmap_domain_mem_cache(&l2cache);
+    unmap_domain_mem_cache(&l1cache);
+
     return 1;
 }
 
@@ -915,13 +973,13 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode)
         {
             // external guests provide their own memory for their P2M maps.
             //
-            ASSERT( d == page_get_owner(&frame_table[pagetable_val(
-                                        d->arch.phys_table)>>PAGE_SHIFT]) );
+            ASSERT( d == page_get_owner(
+                        &frame_table[pagetable_get_pfn(d->arch.phys_table)]) );
         }
     }
 
     printk("audit1\n");
-    _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK);
+    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
     printk("audit1 done\n");
 
     // Get rid of any shadow pages from any previous shadow mode.
@@ -929,15 +987,9 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode)
     free_shadow_pages(d);
 
     printk("audit2\n");
-    _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK);
+    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
     printk("audit2 done\n");
 
-    // Turn off writable page tables.
-    // It doesn't mix with shadow mode.
-    // And shadow mode offers a superset of functionality.
-    //
-    vm_assist(d, VMASST_CMD_disable, VMASST_TYPE_writable_pagetables);
-
     /*
      * Tear down it's counts by disassembling its page-table-based ref counts.
      * Also remove CR3's gcount/tcount.
@@ -959,23 +1011,27 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode)
      * Assert that no pages are left with L1/L2/L3/L4 type.
      */
     audit_adjust_pgtables(d, -1, 1);
+
     d->arch.shadow_mode = mode;
 
-    struct list_head *list_ent = d->page_list.next;
-    while ( list_ent != &d->page_list )
+    if ( shadow_mode_refcounts(d) )
     {
-        struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
-        if ( !get_page_type(page, PGT_writable_page) )
-            BUG();
-        put_page_type(page);
+        struct list_head *list_ent = d->page_list.next;
+        while ( list_ent != &d->page_list )
+        {
+            struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
+            if ( !get_page_type(page, PGT_writable_page) )
+                BUG();
+            put_page_type(page);
 
-        list_ent = page->list.next;
+            list_ent = page->list.next;
+        }
     }
 
     audit_adjust_pgtables(d, 1, 1);
 
     printk("audit3\n");
-    _audit_domain(d, AUDIT_ALREADY_LOCKED);
+    _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK);
     printk("audit3 done\n");
 
     return 0;
@@ -1120,8 +1176,8 @@ void __shadow_mode_disable(struct domain *d)
      * Currently this does not fix up page ref counts, so it is valid to call
      * only when a domain is being destroyed.
      */
-    BUG_ON(!test_bit(DF_DYING, &d->d_flags));
-    d->arch.shadow_tainted_refcnts = 1;
+    BUG_ON(!test_bit(DF_DYING, &d->d_flags) && shadow_mode_refcounts(d));
+    d->arch.shadow_tainted_refcnts = shadow_mode_refcounts(d);
 
     free_shadow_pages(d);
     free_writable_pte_predictions(d);
@@ -1138,11 +1194,17 @@ void __shadow_mode_disable(struct domain *d)
         }
     }
 #endif
-    
+
     d->arch.shadow_mode = 0;
 
     free_shadow_ht_entries(d);
     free_out_of_sync_entries(d);
+
+    struct exec_domain *ed;
+    for_each_exec_domain(d, ed)
+    {
+        update_pagetables(ed);
+    }
 }
 
 static int shadow_mode_table_op(
@@ -1281,6 +1343,7 @@ int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
     switch ( op )
     {
     case DOM0_SHADOW_CONTROL_OP_OFF:
+        __shadow_sync_all(d);
         __shadow_mode_disable(d);
         break;
 
@@ -1298,7 +1361,7 @@ int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
     case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
         free_shadow_pages(d);
         rc = __shadow_mode_enable(
-            d, d->arch.shadow_mode|SHM_enable|SHM_translate);
+            d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate);
         break;
 
     default:
@@ -1560,23 +1623,23 @@ void shadow_map_l1_into_current_l2(unsigned long va)
 
     if ( init_table )
     {
+        l1_pgentry_t sl1e;
+        int index = l1_table_offset(va);
+        int min = 1, max = 0;
+
         gpl1e = &(linear_pg_table[l1_linear_offset(va) &
                               ~(L1_PAGETABLE_ENTRIES-1)]);
 
         spl1e = &(shadow_linear_pg_table[l1_linear_offset(va) &
                                      ~(L1_PAGETABLE_ENTRIES-1)]);
 
-        l1_pgentry_t sl1e;
-        int index = l1_table_offset(va);
-        int min = 1, max = 0;
-
         for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
         {
             l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
             if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
-                 !shadow_get_page_from_l1e(sl1e, d) )
+                 unlikely(!shadow_get_page_from_l1e(sl1e, d)) )
                 sl1e = l1e_empty();
-            if ( l1e_get_value(sl1e) == 0 ) /* FIXME: check flags? */
+            if ( l1e_get_flags(sl1e) == 0 )
             {
                 // First copy entries from 0 until first invalid.
                 // Then copy entries from index until first invalid.
@@ -1695,7 +1758,8 @@ shadow_make_snapshot(
     if ( !get_shadow_ref(smfn) )
         BUG();
 
-    if ( shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow )
+    if ( shadow_mode_refcounts(d) &&
+         (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) )
         min_max = pfn_to_page(sl1mfn)->tlbflush_timestamp;
     pfn_to_page(smfn)->tlbflush_timestamp = min_max;
 
@@ -1748,7 +1812,18 @@ shadow_mark_mfn_out_of_sync(struct exec_domain *ed, unsigned long gpfn,
 
     ASSERT(spin_is_locked(&d->arch.shadow_lock));
     ASSERT(pfn_valid(mfn));
-    ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page);
+
+#ifndef NDEBUG
+    u32 type = page->u.inuse.type_info & PGT_type_mask;
+    if ( shadow_mode_refcounts(d) )
+    {
+        ASSERT(type == PGT_writable_page);
+    }
+    else
+    {
+        ASSERT(type && (type < PGT_l4_page_table));
+    }
+#endif
 
     FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08x", __func__,
             gpfn, mfn, page->count_info, page->u.inuse.type_info);
@@ -1766,6 +1841,10 @@ shadow_mark_mfn_out_of_sync(struct exec_domain *ed, unsigned long gpfn,
     entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
     entry->writable_pl1e = -1;
 
+#if SHADOW_DEBUG
+    mark_shadows_as_reflecting_snapshot(d, gpfn);
+#endif
+
     // increment guest's ref count to represent the entry in the
     // full shadow out-of-sync list.
     //
@@ -1859,7 +1938,7 @@ static int snapshot_entry_matches(
 int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va)
 {
     struct domain *d = ed->domain;
-    unsigned long l2mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
+    unsigned long l2mfn = pagetable_get_pfn(ed->arch.guest_table);
     l2_pgentry_t l2e;
     unsigned long l1mfn;
 
@@ -1867,6 +1946,10 @@ int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va)
 
     perfc_incrc(shadow_out_of_sync_calls);
 
+    // PERF BUG: snapshot_entry_matches will call map_domain_mem() on the l2
+    // page, but it's already available at ed->arch.guest_vtable...
+    // Ditto for the sl2 page and ed->arch.shadow_vtable.
+    //
     if ( page_out_of_sync(&frame_table[l2mfn]) &&
          !snapshot_entry_matches(ed, l2mfn, l2_table_offset(va)) )
         return 1;
@@ -1881,6 +1964,10 @@ int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va)
     if ( !VALID_MFN(l1mfn) )
         return 0;
 
+    // PERF BUG: snapshot_entry_matches will call map_domain_mem() on the l1
+    // page, but it's already available at linear_pg_table[l1_linear_offset()].
+    // Ditto for the sl1 page and shadow_linear_pg_table[]...
+    //
     if ( page_out_of_sync(&frame_table[l1mfn]) &&
          !snapshot_entry_matches(ed, l1mfn, l1_table_offset(va)) )
         return 1;
@@ -2002,7 +2089,7 @@ static u32 remove_all_write_access_in_ptpage(
         found++;
         pt[i] = new;
         if ( is_l1_shadow )
-            put_page_from_l1e(old, d);
+            shadow_put_page_from_l1e(old, d);
 
 #if 0
         printk("removed write access to pfn=%lx mfn=%lx in smfn=%lx entry %x "
@@ -2060,8 +2147,7 @@ int shadow_remove_all_write_access(
     //
     write_refs =
         (frame_table[readonly_gmfn].u.inuse.type_info & PGT_count_mask);
-    if ( write_refs &&
-         (frame_table[readonly_gmfn].u.inuse.type_info & PGT_pinned) )
+    if ( write_refs && MFN_PINNED(readonly_gmfn) )
     {
         write_refs--;
     }
@@ -2141,7 +2227,7 @@ static u32 remove_all_access_in_page(
             count++;
 
             if ( is_l1_shadow )
-                put_page_from_l1e(ol2e, d);
+                shadow_put_page_from_l1e(ol2e, d);
             else /* must be an hl2 page */
                 put_page(&frame_table[forbidden_gmfn]);
         }
@@ -2210,8 +2296,23 @@ static int resync_all(struct domain *d, u32 stype)
         if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
             continue;
 
-        if ( !(smfn = __shadow_status(d, entry->gpfn, stype)) )
-            continue;
+        smfn = __shadow_status(d, entry->gpfn, stype);
+
+        if ( !smfn )
+        {
+            if ( shadow_mode_refcounts(d) )
+                continue;
+
+            // For light weight shadows, even when no shadow page exists,
+            // we need to resync the refcounts to the new contents of the
+            // guest page.
+            // This only applies when we have writable page tables.
+            //
+            if ( (stype == PGT_l1_shadow) && !VM_ASSIST(d, VMASST_TYPE_writable_pagetables) )
+                continue;
+            if ( (stype != PGT_l1_shadow) && !shadow_mode_write_all(d) )
+                continue;
+        }
 
         FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx",
                 stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
@@ -2221,12 +2322,29 @@ static int resync_all(struct domain *d, u32 stype)
         //
         guest    = map_domain_mem(entry->gmfn         << PAGE_SHIFT);
         snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
-        shadow   = map_domain_mem(smfn                << PAGE_SHIFT);
+
+        if ( smfn )
+            shadow = map_domain_mem(smfn << PAGE_SHIFT);
+        else
+            shadow = NULL;
+
         unshadow = 0;
 
         switch ( stype ) {
         case PGT_l1_shadow:
         {
+            l1_pgentry_t *guest1 = guest;
+            l1_pgentry_t *shadow1 = shadow;
+            l1_pgentry_t *snapshot1 = snapshot;
+
+            ASSERT(VM_ASSIST(d, VMASST_TYPE_writable_pagetables));
+
+            if ( !shadow_mode_refcounts(d) )
+                revalidate_l1(d, guest1, snapshot1);
+
+            if ( !smfn )
+                break;
+
             u32 min_max_shadow = pfn_to_page(smfn)->tlbflush_timestamp;
             int min_shadow = SHADOW_MIN(min_max_shadow);
             int max_shadow = SHADOW_MAX(min_max_shadow);
@@ -2236,10 +2354,6 @@ static int resync_all(struct domain *d, u32 stype)
             int min_snapshot = SHADOW_MIN(min_max_snapshot);
             int max_snapshot = SHADOW_MAX(min_max_snapshot);
 
-            l1_pgentry_t *guest1 = guest;
-            l1_pgentry_t *shadow1 = shadow;
-            l1_pgentry_t *snapshot1 = snapshot;
-
             changed = 0;
 
             for ( i = min_shadow; i <= max_shadow; i++ )
@@ -2270,6 +2384,9 @@ static int resync_all(struct domain *d, u32 stype)
             l2_pgentry_t *shadow2 = shadow;
             l2_pgentry_t *snapshot2 = snapshot;
 
+            ASSERT(shadow_mode_write_all(d));
+            BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
+
             changed = 0;
             for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
             {
@@ -2295,8 +2412,7 @@ static int resync_all(struct domain *d, u32 stype)
                 //       Need a better solution long term.
                 if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
                      unlikely(l2e_get_value(new_pde) != 0) &&
-                     !unshadow &&
-                     (frame_table[smfn].u.inuse.type_info & PGT_pinned) )
+                     !unshadow && MFN_PINNED(smfn) )
                     unshadow = 1;
             }
             if ( max == -1 )
@@ -2311,6 +2427,9 @@ static int resync_all(struct domain *d, u32 stype)
             l2_pgentry_t *snapshot2 = snapshot;
             l1_pgentry_t *shadow2 = shadow;
             
+            ASSERT(shadow_mode_write_all(d));
+            BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
+
             changed = 0;
             for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
             {
@@ -2338,7 +2457,8 @@ static int resync_all(struct domain *d, u32 stype)
             BUG();
         }
 
-        unmap_domain_mem(shadow);
+        if ( smfn )
+            unmap_domain_mem(shadow);
         unmap_domain_mem(snapshot);
         unmap_domain_mem(guest);
 
@@ -2351,7 +2471,7 @@ static int resync_all(struct domain *d, u32 stype)
                 unsigned long hl2mfn;
 
                 if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
-                     (frame_table[hl2mfn].u.inuse.type_info & PGT_pinned) )
+                     MFN_PINNED(hl2mfn) )
                     shadow_unpin(hl2mfn);
             }
         }
@@ -2388,7 +2508,7 @@ void __shadow_sync_all(struct domain *d)
              !shadow_get_page_from_l1e(npte, d) )
             BUG();
         *ppte = npte;
-        put_page_from_l1e(opte, d);
+        shadow_put_page_from_l1e(opte, d);
 
         unmap_domain_mem(ppte);
     }
@@ -2475,13 +2595,23 @@ int shadow_fault(unsigned long va, struct xen_regs *regs)
     /* Write fault? */
     if ( regs->error_code & 2 )  
     {
+        int allow_writes = 0;
+
         if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
         {
-            /* Write fault on a read-only mapping. */
-            SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%lx)", 
-                     l1e_get_value(gpte));
-            perfc_incrc(shadow_fault_bail_ro_mapping);
-            goto fail;
+            if ( shadow_mode_page_writable(d, l1e_get_pfn(gpte)) )
+            {
+                allow_writes = 1;
+                l1e_add_flags(&gpte, _PAGE_RW);
+            }
+            else
+            {
+                /* Write fault on a read-only mapping. */
+                SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%lx)", 
+                         l1e_get_value(gpte));
+                perfc_incrc(shadow_fault_bail_ro_mapping);
+                goto fail;
+            }
         }
 
         if ( !l1pte_write_fault(ed, &gpte, &spte, va) )
@@ -2491,6 +2621,9 @@ int shadow_fault(unsigned long va, struct xen_regs *regs)
             shadow_unlock(d);
             return 0;
         }
+
+        if ( allow_writes )
+            l1e_remove_flags(&gpte, _PAGE_RW);
     }
     else
     {
@@ -2506,21 +2639,22 @@ int shadow_fault(unsigned long va, struct xen_regs *regs)
     /*
      * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
      */
-
-    /* XXX Watch out for read-only L2 entries! (not used in Linux). */
-    if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
-                                 &gpte, sizeof(gpte))) )
+    if ( l1e_has_changed(&orig_gpte, &gpte, PAGE_FLAG_MASK) )
     {
-        printk("shadow_fault() failed, crashing domain %d "
-               "due to a read-only L2 page table (gpde=%lx), va=%lx\n",
-               d->id, l2e_get_value(gpde), va);
-        domain_crash_synchronous();
-    }
+        /* XXX Watch out for read-only L2 entries! (not used in Linux). */
+        if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
+                                     &gpte, sizeof(gpte))) )
+        {
+            printk("%s() failed, crashing domain %d "
+                   "due to a read-only L2 page table (gpde=%lx), va=%lx\n",
+                   __func__, d->id, l2e_get_value(gpde), va);
+            domain_crash_synchronous();
+        }
 
-    // if necessary, record the page table page as dirty
-    if ( unlikely(shadow_mode_log_dirty(d)) &&
-         l1e_has_changed(&orig_gpte, &gpte, PAGE_FLAG_MASK))
-        mark_dirty(d, __gpfn_to_mfn(d, l2e_get_pfn(gpde)));
+        // if necessary, record the page table page as dirty
+        if ( unlikely(shadow_mode_log_dirty(d)) )
+            __mark_dirty(d, __gpfn_to_mfn(d, l2e_get_pfn(gpde)));
+    }
 
     shadow_set_l1e(va, spte, 1);
 
@@ -2537,6 +2671,109 @@ int shadow_fault(unsigned long va, struct xen_regs *regs)
     return 0;
 }
 
+void shadow_l1_normal_pt_update(
+    struct domain *d,
+    unsigned long pa, l1_pgentry_t gpte,
+    struct map_dom_mem_cache *cache)
+{
+    unsigned long sl1mfn;    
+    l1_pgentry_t *spl1e, spte;
+
+    shadow_lock(d);
+
+    sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow);
+    if ( sl1mfn )
+    {
+        SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%08lx",
+                 (void *)pa, l1e_get_value(gpte));
+        l1pte_propagate_from_guest(current->domain, gpte, &spte);
+
+        spl1e = map_domain_mem_with_cache(sl1mfn << PAGE_SHIFT, cache);
+        spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte;
+        unmap_domain_mem_with_cache(spl1e, cache);
+    }
+
+    shadow_unlock(d);
+}
+
+void shadow_l2_normal_pt_update(
+    struct domain *d,
+    unsigned long pa, l2_pgentry_t gpde,
+    struct map_dom_mem_cache *cache)
+{
+    unsigned long sl2mfn;
+    l2_pgentry_t *spl2e;
+
+    shadow_lock(d);
+
+    sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow);
+    if ( sl2mfn )
+    {
+        SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%08lx",
+                 (void *)pa, l2e_get_value(gpde));
+        spl2e = map_domain_mem_with_cache(sl2mfn << PAGE_SHIFT, cache);
+        validate_pde_change(d, gpde,
+                            &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]);
+        unmap_domain_mem_with_cache(spl2e, cache);
+    }
+
+    shadow_unlock(d);
+}
+
+#ifdef __x86_64__
+void shadow_l3_normal_pt_update(
+    struct domain *d,
+    unsigned long pa, l3_pgentry_t gpde,
+    struct map_dom_mem_cache *cache)
+{
+    BUG(); // not yet implemented
+}
+
+void shadow_l4_normal_pt_update(
+    struct domain *d,
+    unsigned long pa, l4_pgentry_t gpde,
+    struct map_dom_mem_cache *cache)
+{
+    BUG(); // not yet implemented
+}
+#endif
+
+int shadow_do_update_va_mapping(unsigned long va,
+                                l1_pgentry_t val,
+                                struct exec_domain *ed)
+{
+    struct domain *d = ed->domain;
+    l1_pgentry_t spte;
+    int rc = 0;
+
+    shadow_lock(d);
+
+    //printk("%s(va=%p, val=%p)\n", __func__, (void *)va, (void *)l1e_get_value(val));
+        
+    // This is actually overkill - we don't need to sync the L1 itself,
+    // just everything involved in getting to this L1 (i.e. we need
+    // linear_pg_table[l1_linear_offset(va)] to be in sync)...
+    //
+    __shadow_sync_va(ed, va);
+
+    l1pte_propagate_from_guest(d, val, &spte);
+    shadow_set_l1e(va, spte, 0);
+
+    /*
+     * If we're in log-dirty mode then we need to note that we've updated
+     * the PTE in the PT-holding page. We need the machine frame number
+     * for this.
+     */
+    if ( shadow_mode_log_dirty(d) )
+        __mark_dirty(d, va_to_l1mfn(ed, va));
+
+// out:
+    shadow_unlock(d);
+
+    return rc;
+}
+
+
 /*
  * What lives where in the 32-bit address space in the various shadow modes,
  * and what it uses to get/maintain that mapping.
@@ -2566,7 +2803,7 @@ int shadow_fault(unsigned long va, struct xen_regs *regs)
 void __update_pagetables(struct exec_domain *ed)
 {
     struct domain *d = ed->domain;
-    unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
+    unsigned long gmfn = pagetable_get_pfn(ed->arch.guest_table);
     unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
     unsigned long smfn, hl2mfn, old_smfn;
 
@@ -2595,7 +2832,7 @@ void __update_pagetables(struct exec_domain *ed)
         smfn = shadow_l2_table(d, gpfn, gmfn);
     if ( !get_shadow_ref(smfn) )
         BUG();
-    old_smfn = pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT;
+    old_smfn = pagetable_get_pfn(ed->arch.shadow_table);
     ed->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT);
     if ( old_smfn )
         put_shadow_ref(old_smfn);
@@ -2665,6 +2902,47 @@ void __update_pagetables(struct exec_domain *ed)
 
 #if SHADOW_DEBUG
 
+// The following is entirely for _check_pagetable()'s benefit.
+// _check_pagetable() wants to know whether a given entry in a
+// shadow page table is supposed to be the shadow of the guest's
+// current entry, or the shadow of the entry held in the snapshot
+// taken above.
+//
+// Here, we mark all currently existing entries as reflecting
+// the snapshot, above.  All other places in xen that update
+// the shadow will keep the shadow in sync with the guest's
+// entries (via l1pte_propagate_from_guest and friends), which clear
+// the SHADOW_REFLECTS_SNAPSHOT bit.
+//
+static void
+mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn)
+{
+    unsigned long smfn;
+    l1_pgentry_t *l1e;
+    l2_pgentry_t *l2e;
+    unsigned i;
+
+    if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) )
+    {
+        l1e = map_domain_mem(smfn << PAGE_SHIFT);
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+            if ( is_guest_l1_slot(i) &&
+                 (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) )
+                l1e_add_flags(&l1e[i], SHADOW_REFLECTS_SNAPSHOT);
+        unmap_domain_mem(l1e);
+    }
+
+    if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) )
+    {
+        l2e = map_domain_mem(smfn << PAGE_SHIFT);
+        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+            if ( is_guest_l2_slot(i) &&
+                 (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) )
+                l2e_add_flags(&l2e[i], SHADOW_REFLECTS_SNAPSHOT);
+        unmap_domain_mem(l2e);
+    }
+}
+
 // BUG: these are not SMP safe...
 static int sh_l2_present;
 static int sh_l1_present;
@@ -2687,96 +2965,109 @@ int shadow_status_noswap;
 
 #define FAIL(_f, _a...)                                                      \
     do {                                                                     \
-        printk("XXX %s-FAIL (%d,%d,%d)" _f " at %s(%d)\n",                   \
+        printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n",                  \
                sh_check_name, level, l2_idx, l1_idx, ## _a,                  \
                __FILE__, __LINE__);                                          \
-        printk("g=%lx s=%lx &g=%p &s=%p"                                     \
-               " v2m(&g)=%08lx v2m(&s)=%08lx ea=%08x\n",                     \
-               l1e_get_value(gpte), l1e_get_value(spte), pgpte, pspte,       \
-               v2m(ed, pgpte), v2m(ed, pspte),                               \
+        printk("guest_pte=%lx eff_guest_pte=%lx shadow_pte=%lx "             \
+               "snapshot_pte=%lx &guest=%p &shadow=%p &snap=%p "             \
+               "v2m(&guest)=%p v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n",     \
+               l1e_get_value(guest_pte), l1e_get_value(eff_guest_pte),       \
+               l1e_get_value(shadow_pte), l1e_get_value(snapshot_pte),       \
+               p_guest_pte, p_shadow_pte, p_snapshot_pte,                    \
+               (void *)v2m(ed, p_guest_pte), (void *)v2m(ed, p_shadow_pte),  \
+               (void *)v2m(ed, p_snapshot_pte),                              \
                (l2_idx << L2_PAGETABLE_SHIFT) |                              \
                (l1_idx << L1_PAGETABLE_SHIFT));                              \
         errors++;                                                            \
     } while ( 0 )
 
 static int check_pte(
-    struct exec_domain *ed, l1_pgentry_t *pgpte, l1_pgentry_t *pspte, 
-    int level, int l2_idx, int l1_idx, int oos_ptes)
+    struct exec_domain *ed,
+    l1_pgentry_t *p_guest_pte,
+    l1_pgentry_t *p_shadow_pte,
+    l1_pgentry_t *p_snapshot_pte,
+    int level, int l2_idx, int l1_idx)
 {
     struct domain *d = ed->domain;
-    l1_pgentry_t gpte = *pgpte;
-    l1_pgentry_t spte = *pspte;
-    unsigned long mask, gpfn, smfn, gmfn;
-    int errors = 0;
+    l1_pgentry_t guest_pte = *p_guest_pte;
+    l1_pgentry_t shadow_pte = *p_shadow_pte;
+    l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty();
+    l1_pgentry_t eff_guest_pte;
+    unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn;
+    int errors = 0, guest_writable;
     int page_table_page;
 
-    if ( (l1e_get_value(spte) == 0) ||
-         (l1e_get_value(spte) == 0xdeadface) ||
-         (l1e_get_value(spte) == 0x00000E00) )
+    if ( (l1e_get_value(shadow_pte) == 0) ||
+         (l1e_get_value(shadow_pte) == 0xdeadface) ||
+         (l1e_get_value(shadow_pte) == 0x00000E00) )
         return errors;  /* always safe */
 
-    if ( !(l1e_get_flags(spte) & _PAGE_PRESENT) )
-        FAIL("Non zero not present spte");
+    if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) )
+        FAIL("Non zero not present shadow_pte");
 
     if ( level == 2 ) sh_l2_present++;
     if ( level == 1 ) sh_l1_present++;
 
-    if ( !(l1e_get_flags(gpte) & _PAGE_PRESENT) )
+    if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte )
+        eff_guest_pte = snapshot_pte;
+    else
+        eff_guest_pte = guest_pte;
+
+    if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) )
         FAIL("Guest not present yet shadow is");
 
-    mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|PAGE_MASK);
+    mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK);
 
-    if ( (l1e_get_value(spte) & mask) != (l1e_get_value(gpte) & mask) )
+    if ( ((l1e_get_value(shadow_pte) & mask) != (l1e_get_value(eff_guest_pte) & mask)) )
         FAIL("Corrupt?");
 
     if ( (level == 1) &&
-         (l1e_get_flags(spte) & _PAGE_DIRTY) &&
-         !(l1e_get_flags(gpte) & _PAGE_DIRTY) && !oos_ptes )
+         (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) &&
+         !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) )
         FAIL("Dirty coherence");
 
-    if ( (l1e_get_flags(spte) & _PAGE_ACCESSED) &&
-         !(l1e_get_flags(gpte) & _PAGE_ACCESSED) && !oos_ptes )
+    if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) &&
+         !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) )
         FAIL("Accessed coherence");
 
-    if ( l1e_get_flags(spte) & _PAGE_GLOBAL )
+    if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL )
         FAIL("global bit set in shadow");
 
-    smfn = l1e_get_pfn(spte);
-    gpfn = l1e_get_pfn(gpte);
-    gmfn = __gpfn_to_mfn(d, gpfn);
+    eff_guest_pfn = l1e_get_pfn(eff_guest_pte);
+    eff_guest_mfn = __gpfn_to_mfn(d, eff_guest_pfn);
+    shadow_mfn = l1e_get_pfn(shadow_pte);
+
+    if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) )
+        FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%lx\n", __func__, eff_guest_pfn,
+             l1e_get_value(eff_guest_pte));
 
-    if ( !VALID_MFN(gmfn) )
-        FAIL("%s: invalid gpfn=%lx gpte=%lx\n", __func__, gpfn,
-             l1e_get_value(gpte));
+    page_table_page = mfn_is_page_table(eff_guest_mfn);
 
-    page_table_page = mfn_is_page_table(gmfn);
+    guest_writable =
+        (l1e_get_flags(eff_guest_pte) & _PAGE_RW) ||
+        (VM_ASSIST(d, VMASST_TYPE_writable_pagetables) && (level == 1) && mfn_out_of_sync(eff_guest_mfn));
 
-    if ( (l1e_get_flags(spte) & _PAGE_RW ) &&
-         !(l1e_get_flags(gpte) & _PAGE_RW) && !oos_ptes )
+    if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable )
     {
-        printk("gpfn=%lx gmfn=%lx smfn=%lx t=0x%08x page_table_page=%d "
-               "oos_ptes=%d\n",
-               gpfn, gmfn, smfn,
-               frame_table[gmfn].u.inuse.type_info,
-               page_table_page, oos_ptes);
+        printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08x page_table_page=%d\n",
+               eff_guest_pfn, eff_guest_mfn, shadow_mfn,
+               frame_table[eff_guest_mfn].u.inuse.type_info,
+               page_table_page);
         FAIL("RW coherence");
     }
 
     if ( (level == 1) &&
-         (l1e_get_flags(spte) & _PAGE_RW ) &&
-         !((l1e_get_flags(gpte) & _PAGE_RW) &&
-           (l1e_get_flags(gpte) & _PAGE_DIRTY)) &&
-         !oos_ptes )
-    {
-        printk("gpfn=%lx gmfn=%lx smfn=%lx t=0x%08x page_table_page=%d "
-               "oos_ptes=%d\n",
-               gpfn, gmfn, smfn,
-               frame_table[gmfn].u.inuse.type_info,
-               page_table_page, oos_ptes);
+         (l1e_get_flags(shadow_pte) & _PAGE_RW ) &&
+         !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) )
+    {
+        printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08x page_table_page=%d\n",
+               eff_guest_pfn, eff_guest_mfn, shadow_mfn,
+               frame_table[eff_guest_mfn].u.inuse.type_info,
+               page_table_page);
         FAIL("RW2 coherence");
     }
  
-    if ( gmfn == smfn )
+    if ( eff_guest_mfn == shadow_mfn )
     {
         if ( level > 1 )
             FAIL("Linear map ???");    /* XXX this will fail on BSD */
@@ -2788,9 +3079,9 @@ static int check_pte(
 
         if ( level == 2 )
         {
-            if ( __shadow_status(d, gpfn, PGT_l1_shadow) != smfn )
-                FAIL("smfn problem gpfn=%lx smfn=%lx", gpfn,
-                     __shadow_status(d, gpfn, PGT_l1_shadow));
+            if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn )
+                FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn,
+                     __shadow_status(d, eff_guest_pfn, PGT_l1_shadow));
         }
         else
             BUG(); // XXX -- not handled yet.
@@ -2807,24 +3098,29 @@ static int check_l1_table(
 {
     struct domain *d = ed->domain;
     int i;
-    l1_pgentry_t *gpl1e, *spl1e;
-    int errors = 0, oos_ptes = 0;
+    unsigned long snapshot_mfn;
+    l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL;
+    int errors = 0;
 
     if ( page_out_of_sync(pfn_to_page(gmfn)) )
     {
-        gmfn = __shadow_status(d, gpfn, PGT_snapshot);
-        oos_ptes = 1;
-        ASSERT(gmfn);
+        snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
+        ASSERT(snapshot_mfn);
+        p_snapshot = map_domain_mem(snapshot_mfn << PAGE_SHIFT);
     }
 
-    gpl1e = map_domain_mem(gmfn << PAGE_SHIFT);
-    spl1e = map_domain_mem(smfn << PAGE_SHIFT);
+    p_guest  = map_domain_mem(gmfn << PAGE_SHIFT);
+    p_shadow = map_domain_mem(smfn << PAGE_SHIFT);
 
     for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-        errors += check_pte(ed, &gpl1e[i], &spl1e[i], 1, l2_idx, i, oos_ptes);
+        errors += check_pte(ed, p_guest+i, p_shadow+i,
+                            p_snapshot ? p_snapshot+i : NULL,
+                            1, l2_idx, i);
  
-    unmap_domain_mem(spl1e);
-    unmap_domain_mem(gpl1e);
+    unmap_domain_mem(p_shadow);
+    unmap_domain_mem(p_guest);
+    if ( p_snapshot )
+        unmap_domain_mem(p_snapshot);
 
     return errors;
 }
@@ -2909,7 +3205,8 @@ int check_l2_table(
         errors += check_pte(ed,
                             (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
                             (l1_pgentry_t*)(&spl2e[i]),
-                            2, i, 0, 0);
+                            NULL,
+                            2, i, 0);
 
     unmap_domain_mem(spl2e);
     unmap_domain_mem(gpl2e);
index 948dd1802bc26c5aff5afc3fe6b3098a9e9a1679..4f87b7f96f7648faefd0db57ba41e21a4396d335 100644 (file)
@@ -270,7 +270,8 @@ asmlinkage int do_page_fault(struct xen_regs *regs)
 
     perfc_incrc(page_faults);
 
-    if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
+    if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
+                !shadow_mode_enabled(d)) )
     {
         LOCK_BIGLOCK(d);
         if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
@@ -286,8 +287,6 @@ asmlinkage int do_page_fault(struct xen_regs *regs)
              ((regs->error_code & 3) == 3) && /* write-protection fault */
              ptwr_do_page_fault(d, addr) )
         {
-            if ( unlikely(shadow_mode_enabled(d)) )
-                (void)shadow_fault(addr, regs);
             UNLOCK_BIGLOCK(d);
             return EXCRET_fault_fixed;
         }
index db82c73eac5ca50f7955a7df25b553025ab4c86f..b677b9dc2430a165feac322a941fbbe8e83fa086 100644 (file)
@@ -672,7 +672,7 @@ static int vmx_set_cr0(unsigned long value)
                         d->arch.arch_vmx.cpu_cr3);
             domain_crash_synchronous(); /* need to take a clean path */
         }
-        old_base_mfn = pagetable_val(d->arch.guest_table) >> PAGE_SHIFT;
+        old_base_mfn = pagetable_get_pfn(d->arch.guest_table);
         if (old_base_mfn)
             put_page(pfn_to_page(old_base_mfn));
 
@@ -798,7 +798,7 @@ static int mov_to_cr(int gp, int cr, struct xen_regs *regs)
                         "Invalid CR3 value=%lx", value);
                 domain_crash_synchronous(); /* need to take a clean path */
             }
-            old_base_mfn = pagetable_val(d->arch.guest_table) >> PAGE_SHIFT;
+            old_base_mfn = pagetable_get_pfn(d->arch.guest_table);
             d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
             if (old_base_mfn)
                 put_page(pfn_to_page(old_base_mfn));
index 014fa7e45dc9fd5737ff012792a6cef8d2b4f26f..cf4e88bc6d8d45177782ae786203fa2ebec6a432 100644 (file)
@@ -150,7 +150,7 @@ extern void invalidate_shadow_ldt(struct exec_domain *d);
 extern int shadow_remove_all_write_access(
     struct domain *d, unsigned long gpfn, unsigned long gmfn);
 extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn);
-extern int _shadow_mode_enabled(struct domain *d);
+extern int _shadow_mode_refcounts(struct domain *d);
 
 static inline void put_page(struct pfn_info *page)
 {
@@ -182,7 +182,7 @@ static inline int get_page(struct pfn_info *page,
              unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
              unlikely(d != _domain) )                /* Wrong owner? */
         {
-            if ( !_shadow_mode_enabled(domain) )
+            if ( !_shadow_mode_refcounts(domain) )
                 DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%08x\n",
                         page_to_pfn(page), domain, unpickle_domptr(d),
                         x, page->u.inuse.type_info);
@@ -315,14 +315,21 @@ int  ptwr_init(struct domain *);
 void ptwr_destroy(struct domain *);
 void ptwr_flush(struct domain *, const int);
 int  ptwr_do_page_fault(struct domain *, unsigned long);
+int  revalidate_l1(struct domain *, l1_pgentry_t *, l1_pgentry_t *);
 
 #define cleanup_writable_pagetable(_d)                                      \
     do {                                                                    \
-        if ( unlikely(VM_ASSIST((_d), VMASST_TYPE_writable_pagetables)) ) { \
-            if ( (_d)->arch.ptwr[PTWR_PT_ACTIVE].l1va )                     \
-                ptwr_flush((_d), PTWR_PT_ACTIVE);                           \
-            if ( (_d)->arch.ptwr[PTWR_PT_INACTIVE].l1va )                   \
-                ptwr_flush((_d), PTWR_PT_INACTIVE);                         \
+        if ( likely(VM_ASSIST((_d), VMASST_TYPE_writable_pagetables)) )     \
+        {                                                                   \
+            if ( likely(!shadow_mode_enabled(_d)) )                         \
+            {                                                               \
+                if ( (_d)->arch.ptwr[PTWR_PT_ACTIVE].l1va )                 \
+                    ptwr_flush((_d), PTWR_PT_ACTIVE);                       \
+                if ( (_d)->arch.ptwr[PTWR_PT_INACTIVE].l1va )               \
+                    ptwr_flush((_d), PTWR_PT_INACTIVE);                     \
+            }                                                               \
+            else                                                            \
+                shadow_sync_all(_d);                                        \
         }                                                                   \
     } while ( 0 )
 
@@ -330,9 +337,9 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy);
 
 #ifndef NDEBUG
 
-#define AUDIT_ALREADY_LOCKED ( 1u << 0 )
-#define AUDIT_ERRORS_OK      ( 1u << 1 )
-#define AUDIT_QUIET          ( 1u << 2 )
+#define AUDIT_SHADOW_ALREADY_LOCKED ( 1u << 0 )
+#define AUDIT_ERRORS_OK             ( 1u << 1 )
+#define AUDIT_QUIET                 ( 1u << 2 )
 
 void _audit_domain(struct domain *d, int flags);
 #define audit_domain(_d) _audit_domain((_d), AUDIT_ERRORS_OK)
index a925f90fc810de020232fd033101fdb93101a891..59e582e4cdbf77f3f8d7e82202b1a2af2eb42e9a 100644 (file)
@@ -23,6 +23,7 @@
 #ifndef __ASSEMBLY__
 typedef struct { unsigned long pt_lo; } pagetable_t;
 #define pagetable_val(_x)   ((_x).pt_lo)
+#define pagetable_get_pfn(_x) ((_x).pt_lo >> PAGE_SHIFT)
 #define mk_pagetable(_x)    ( (pagetable_t) { (_x) } )
 #endif
 
@@ -103,6 +104,7 @@ extern void paging_init(void);
 #define _PAGE_PAT      0x080UL
 #define _PAGE_PSE      0x080UL
 #define _PAGE_GLOBAL   0x100UL
+#define _PAGE_AVAIL    0xe00UL
 
 #define __PAGE_HYPERVISOR \
     (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
index 88ea9e6dac94971d335c9a22328389c9bee90b45..146b75ed9d5c722ebbe94ffd658e2b51a94c5ed4 100644 (file)
 /* Shadow PT operation mode : shadow-mode variable in arch_domain. */
 
 #define SHM_enable    (1<<0) /* we're in one of the shadow modes */
-#define SHM_log_dirty (1<<1) /* enable log dirty mode */
-#define SHM_translate (1<<2) /* do p2m tranaltion on guest tables */
-#define SHM_external  (1<<3) /* external page table, not used by Xen */
+#define SHM_refcounts (1<<1) /* refcounts based on shadow tables instead of
+                                guest tables */
+#define SHM_write_all (1<<2) /* allow write access to all guest pt pages,
+                                regardless of pte write permissions */
+#define SHM_log_dirty (1<<3) /* enable log dirty mode */
+#define SHM_translate (1<<4) /* do p2m translation on guest tables */
+#define SHM_external  (1<<5) /* external page table, not used by Xen */
 
 #define shadow_mode_enabled(_d)   ((_d)->arch.shadow_mode)
+#define shadow_mode_refcounts(_d) ((_d)->arch.shadow_mode & SHM_refcounts)
+#define shadow_mode_write_all(_d) ((_d)->arch.shadow_mode & SHM_write_all)
 #define shadow_mode_log_dirty(_d) ((_d)->arch.shadow_mode & SHM_log_dirty)
 #define shadow_mode_translate(_d) ((_d)->arch.shadow_mode & SHM_translate)
 #define shadow_mode_external(_d)  ((_d)->arch.shadow_mode & SHM_external)
@@ -72,7 +78,29 @@ extern void free_monitor_pagetable(struct exec_domain *ed);
 extern void __shadow_sync_all(struct domain *d);
 extern int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va);
 extern int set_p2m_entry(
-    struct domain *d, unsigned long pfn, unsigned long mfn);
+    struct domain *d, unsigned long pfn, unsigned long mfn,
+    struct map_dom_mem_cache *l2cache,
+    struct map_dom_mem_cache *l1cache);
+extern void remove_shadow(struct domain *d, unsigned long gpfn, u32 stype);
+
+extern void shadow_l1_normal_pt_update(struct domain *d,
+                                       unsigned long pa, l1_pgentry_t l1e,
+                                       struct map_dom_mem_cache *cache);
+extern void shadow_l2_normal_pt_update(struct domain *d,
+                                       unsigned long pa, l2_pgentry_t l2e,
+                                       struct map_dom_mem_cache *cache);
+#ifdef __x86_64__
+extern void shadow_l3_normal_pt_update(struct domain *d,
+                                       unsigned long pa, l3_pgentry_t l3e,
+                                       struct map_dom_mem_cache *cache);
+extern void shadow_l4_normal_pt_update(struct domain *d,
+                                       unsigned long pa, l4_pgentry_t l4e,
+                                       struct map_dom_mem_cache *cache);
+#endif
+extern int shadow_do_update_va_mapping(unsigned long va,
+                                       l1_pgentry_t val,
+                                       struct exec_domain *ed);
+
 
 static inline unsigned long __shadow_status(
     struct domain *d, unsigned long gpfn, unsigned long stype);
@@ -82,7 +110,13 @@ extern void vmx_shadow_clear_state(struct domain *);
 
 static inline int page_is_page_table(struct pfn_info *page)
 {
-    return page->count_info & PGC_page_table;
+    struct domain *owner = page_get_owner(page);
+
+    if ( owner && shadow_mode_refcounts(owner) )
+        return page->count_info & PGC_page_table;
+
+    u32 type_info = page->u.inuse.type_info & PGT_type_mask;
+    return type_info && (type_info <= PGT_l4_page_table);
 }
 
 static inline int mfn_is_page_table(unsigned long mfn)
@@ -90,7 +124,7 @@ static inline int mfn_is_page_table(unsigned long mfn)
     if ( !pfn_valid(mfn) )
         return 0;
 
-    return frame_table[mfn].count_info & PGC_page_table;
+    return page_is_page_table(pfn_to_page(mfn));
 }
 
 static inline int page_out_of_sync(struct pfn_info *page)
@@ -103,7 +137,7 @@ static inline int mfn_out_of_sync(unsigned long mfn)
     if ( !pfn_valid(mfn) )
         return 0;
 
-    return frame_table[mfn].count_info & PGC_out_of_sync;
+    return page_out_of_sync(pfn_to_page(mfn));
 }
 
 
@@ -191,10 +225,12 @@ static inline void shadow_mode_disable(struct domain *d)
       : (mfn) )
 
 #define __gpfn_to_mfn(_d, gpfn)                        \
-    ( (shadow_mode_translate(_d))                      \
-      ? ({ ASSERT(current->domain == (_d));            \
-           phys_to_machine_mapping(gpfn); })           \
-      : (gpfn) )
+    ({                                                 \
+        ASSERT(current->domain == (_d));               \
+        (shadow_mode_translate(_d))                    \
+        ? phys_to_machine_mapping(gpfn)                \
+        : (gpfn);                                      \
+    })
 
 #define __gpfn_to_mfn_foreign(_d, gpfn)                \
     ( (shadow_mode_translate(_d))                      \
@@ -237,6 +273,8 @@ struct out_of_sync_entry {
 
 #if SHADOW_DEBUG
 extern int shadow_status_noswap;
+#define _SHADOW_REFLECTS_SNAPSHOT ( 9)
+#define SHADOW_REFLECTS_SNAPSHOT  (1u << _SHADOW_REFLECTS_SNAPSHOT)
 #endif
 
 #ifdef VERBOSE
@@ -292,15 +330,18 @@ shadow_get_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
 
     ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT);
 
+    if ( !shadow_mode_refcounts(d) )
+        return 1;
+
     nl1e = l1e;
     l1e_remove_flags(&nl1e, _PAGE_GLOBAL);
     res = get_page_from_l1e(nl1e, d);
 
     if ( unlikely(!res) && IS_PRIV(d) && !shadow_mode_translate(d) &&
-         !(l1e_get_flags(l1e) & L1_DISALLOW_MASK) &&
-         (mfn = l1e_get_pfn(l1e)) &&
+         !(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) &&
+         (mfn = l1e_get_pfn(nl1e)) &&
          pfn_valid(mfn) &&
-         (owner = page_get_owner(pfn_to_page(l1e_get_pfn(l1e)))) &&
+         (owner = page_get_owner(pfn_to_page(mfn))) &&
          (d != owner) )
     {
         res = get_page_from_l1e(nl1e, owner);
@@ -319,6 +360,103 @@ shadow_get_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
     return res;
 }
 
+static inline void
+shadow_put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
+{
+    if ( !shadow_mode_refcounts(d) )
+        return;
+
+    put_page_from_l1e(l1e, d);
+}
+
+static inline void
+shadow_put_page_type(struct domain *d, struct pfn_info *page)
+{
+    if ( !shadow_mode_refcounts(d) )
+        return;
+
+    put_page_type(page);
+}
+
+static inline int shadow_get_page(struct domain *d,
+                                  struct pfn_info *page,
+                                  struct domain *owner)
+{
+    if ( !shadow_mode_refcounts(d) )
+        return 1;
+    return get_page(page, owner);
+}
+
+static inline void shadow_put_page(struct domain *d,
+                                   struct pfn_info *page)
+{
+    if ( !shadow_mode_refcounts(d) )
+        return;
+    put_page(page);
+}
+
+/************************************************************************/
+
+static inline int __mark_dirty(struct domain *d, unsigned int mfn)
+{
+    unsigned long pfn;
+    int           rc = 0;
+
+    ASSERT(spin_is_locked(&d->arch.shadow_lock));
+    ASSERT(d->arch.shadow_dirty_bitmap != NULL);
+
+    if ( !VALID_MFN(mfn) )
+        return rc;
+
+    // N.B. This doesn't use __mfn_to_gpfn().
+    // This wants the nice compact set of PFNs from 0..domain's max,
+    // which __mfn_to_gpfn() only returns for translated domains.
+    //
+    pfn = machine_to_phys_mapping[mfn];
+
+    /*
+     * Values with the MSB set denote MFNs that aren't really part of the 
+     * domain's pseudo-physical memory map (e.g., the shared info frame).
+     * Nothing to do here...
+     */
+    if ( unlikely(IS_INVALID_M2P_ENTRY(pfn)) )
+        return rc;
+
+    if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) )
+    {
+        /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
+        if ( !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) )
+        {
+            d->arch.shadow_dirty_count++;
+            rc = 1;
+        }
+    }
+#ifndef NDEBUG
+    else if ( mfn < max_page )
+    {
+        SH_LOG("mark_dirty OOR! mfn=%x pfn=%lx max=%x (dom %p)",
+               mfn, pfn, d->arch.shadow_dirty_bitmap_size, d);
+        SH_LOG("dom=%p caf=%08x taf=%08x", 
+               page_get_owner(&frame_table[mfn]),
+               frame_table[mfn].count_info, 
+               frame_table[mfn].u.inuse.type_info );
+    }
+#endif
+
+    return rc;
+}
+
+
+static inline int mark_dirty(struct domain *d, unsigned int mfn)
+{
+    int rc;
+    shadow_lock(d);
+    rc = __mark_dirty(d, mfn);
+    shadow_unlock(d);
+    return rc;
+}
+
+
 /************************************************************************/
 
 static inline void
@@ -350,10 +488,15 @@ static inline void
 __guest_set_l2e(
     struct exec_domain *ed, unsigned long va, l2_pgentry_t value)
 {
+    struct domain *d = ed->domain;
+
     ed->arch.guest_vtable[l2_table_offset(va)] = value;
 
-    if ( unlikely(shadow_mode_translate(ed->domain)) )
+    if ( unlikely(shadow_mode_translate(d)) )
         update_hl2e(ed, va);
+
+    if ( unlikely(shadow_mode_log_dirty(d)) )
+        __mark_dirty(d, pagetable_get_pfn(ed->arch.guest_table));
 }
 
 static inline void
@@ -380,11 +523,12 @@ update_hl2e(struct exec_domain *ed, unsigned long va)
     if ( (l1e_has_changed(&old_hl2e, &new_hl2e, _PAGE_PRESENT)) )
     {
         if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) &&
-             !get_page(pfn_to_page(l1e_get_pfn(new_hl2e)), ed->domain) )
+             !shadow_get_page(ed->domain, pfn_to_page(l1e_get_pfn(new_hl2e)),
+                              ed->domain) )
             new_hl2e = l1e_empty();
         if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT )
         {
-            put_page(pfn_to_page(l1e_get_pfn(old_hl2e)));
+            shadow_put_page(ed->domain, pfn_to_page(l1e_get_pfn(old_hl2e)));
             need_flush = 1;
         }
     }
@@ -401,7 +545,7 @@ update_hl2e(struct exec_domain *ed, unsigned long va)
 static inline void shadow_drop_references(
     struct domain *d, struct pfn_info *page)
 {
-    if ( likely(!shadow_mode_enabled(d)) ||
+    if ( likely(!shadow_mode_refcounts(d)) ||
          ((page->u.inuse.type_info & PGT_count_mask) == 0) )
         return;
 
@@ -423,7 +567,7 @@ static inline void shadow_drop_references(
 static inline void shadow_sync_and_drop_references(
     struct domain *d, struct pfn_info *page)
 {
-    if ( likely(!shadow_mode_enabled(d)) )
+    if ( likely(!shadow_mode_refcounts(d)) )
         return;
 
     shadow_lock(d);
@@ -520,64 +664,6 @@ shadow_unpin(unsigned long smfn)
 }
 
 
-/************************************************************************/
-
-static inline int __mark_dirty(struct domain *d, unsigned int mfn)
-{
-    unsigned long pfn;
-    int           rc = 0;
-
-    ASSERT(spin_is_locked(&d->arch.shadow_lock));
-    ASSERT(d->arch.shadow_dirty_bitmap != NULL);
-
-    if ( !VALID_MFN(mfn) )
-        return rc;
-
-    pfn = __mfn_to_gpfn(d, mfn);
-
-    /*
-     * Values with the MSB set denote MFNs that aren't really part of the 
-     * domain's pseudo-physical memory map (e.g., the shared info frame).
-     * Nothing to do here...
-     */
-    if ( unlikely(IS_INVALID_M2P_ENTRY(pfn)) )
-        return rc;
-
-    if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) )
-    {
-        /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
-        if ( !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) )
-        {
-            d->arch.shadow_dirty_count++;
-            rc = 1;
-        }
-    }
-#ifndef NDEBUG
-    else if ( mfn < max_page )
-    {
-        SH_LOG("mark_dirty OOR! mfn=%x pfn=%lx max=%x (dom %p)",
-               mfn, pfn, d->arch.shadow_dirty_bitmap_size, d);
-        SH_LOG("dom=%p caf=%08x taf=%08x\n", 
-               page_get_owner(&frame_table[mfn]),
-               frame_table[mfn].count_info, 
-               frame_table[mfn].u.inuse.type_info );
-    }
-#endif
-
-    return rc;
-}
-
-
-static inline int mark_dirty(struct domain *d, unsigned int mfn)
-{
-    int rc;
-    shadow_lock(d);
-    rc = __mark_dirty(d, mfn);
-    shadow_unlock(d);
-    return rc;
-}
-
-
 /************************************************************************/
 
 extern void shadow_mark_va_out_of_sync(
@@ -666,8 +752,10 @@ static inline void l1pte_propagate_from_guest(
           (_PAGE_PRESENT|_PAGE_ACCESSED)) &&
          VALID_MFN(mfn = __gpfn_to_mfn(d, l1e_get_pfn(gpte))) )
     {
-        spte = l1e_create_pfn(mfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-        
+        spte = l1e_create_pfn(mfn,
+                              l1e_get_flags(gpte) &
+                              ~(_PAGE_GLOBAL | _PAGE_AVAIL));
+
         if ( shadow_mode_log_dirty(d) ||
              !(l1e_get_flags(gpte) & _PAGE_DIRTY) ||
              mfn_is_page_table(mfn) )
@@ -729,14 +817,13 @@ static inline void l2pde_general(
     spde = l2e_empty();
     if ( (l2e_get_flags(gpde) & _PAGE_PRESENT) && (sl1mfn != 0) )
     {
-        spde = l2e_create_pfn(sl1mfn, 
-                              l2e_get_flags(gpde) | _PAGE_RW | _PAGE_ACCESSED);
-        l2e_add_flags(&gpde, _PAGE_ACCESSED); /* N.B. PDEs do not have a dirty bit. */
+        spde = l2e_create_pfn(sl1mfn,
+                              (l2e_get_flags(gpde) | _PAGE_RW | _PAGE_ACCESSED)
+                              & ~(_PAGE_AVAIL));
+
+        /* N.B. PDEs do not have a dirty bit. */
+        l2e_add_flags(&gpde, _PAGE_ACCESSED);
 
-        // XXX mafetter: Hmm...
-        //     Shouldn't the dirty log be checked/updated here?
-        //     Actually, it needs to be done in this function's callers.
-        //
         *gpde_p = gpde;
     }
 
@@ -769,34 +856,57 @@ validate_pte_change(
     l1_pgentry_t *shadow_pte_p)
 {
     l1_pgentry_t old_spte, new_spte;
+    int need_flush = 0;
 
     perfc_incrc(validate_pte_calls);
 
-#if 0
-    FSH_LOG("validate_pte(old=%lx new=%lx)", old_pte, new_pte);
-#endif
-
-    old_spte = *shadow_pte_p;
     l1pte_propagate_from_guest(d, new_pte, &new_spte);
 
-    // only do the ref counting if something important changed.
-    //
-    if ( ((l1e_get_value(old_spte) | l1e_get_value(new_spte)) & _PAGE_PRESENT ) &&
-         l1e_has_changed(&old_spte, &new_spte, _PAGE_RW | _PAGE_PRESENT) )
+    if ( shadow_mode_refcounts(d) )
     {
-        perfc_incrc(validate_pte_changes);
+        old_spte = *shadow_pte_p;
 
-        if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
-             !shadow_get_page_from_l1e(new_spte, d) )
-            new_spte = l1e_empty();
-        if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
-            put_page_from_l1e(old_spte, d);
+        if ( l1e_get_value(old_spte) == l1e_get_value(new_spte) )
+        {
+            // No accounting required...
+            //
+            perfc_incrc(validate_pte_changes1);
+        }
+        else if ( l1e_get_value(old_spte) == (l1e_get_value(new_spte)|_PAGE_RW) )
+        {
+            // Fast path for PTEs that have merely been write-protected
+            // (e.g., during a Unix fork()). A strict reduction in privilege.
+            //
+            perfc_incrc(validate_pte_changes2);
+            if ( likely(l1e_get_flags(new_spte) & _PAGE_PRESENT) )
+                shadow_put_page_type(d, &frame_table[l1e_get_pfn(new_spte)]);
+        }
+        else if ( ((l1e_get_flags(old_spte) | l1e_get_flags(new_spte)) &
+                   _PAGE_PRESENT ) &&
+                  l1e_has_changed(&old_spte, &new_spte, _PAGE_RW | _PAGE_PRESENT) )
+        {
+            // only do the ref counting if something important changed.
+            //
+            perfc_incrc(validate_pte_changes3);
+
+            if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
+                 !shadow_get_page_from_l1e(new_spte, d) )
+                new_spte = l1e_empty();
+            if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
+            {
+                shadow_put_page_from_l1e(old_spte, d);
+                need_flush = 1;
+            }
+        }
+        else
+        {
+            perfc_incrc(validate_pte_changes4);
+        }
     }
 
     *shadow_pte_p = new_spte;
 
-    // paranoia rules!
-    return 1;
+    return need_flush;
 }
 
 // returns true if a tlb flush is needed
@@ -808,6 +918,7 @@ validate_hl2e_change(
     l1_pgentry_t *shadow_hl2e_p)
 {
     l1_pgentry_t old_hl2e, new_hl2e;
+    int need_flush = 0;
 
     perfc_incrc(validate_hl2e_calls);
 
@@ -825,14 +936,15 @@ validate_hl2e_change(
              !get_page(pfn_to_page(l1e_get_pfn(new_hl2e)), d) )
             new_hl2e = l1e_empty();
         if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT )
+        {
             put_page(pfn_to_page(l1e_get_pfn(old_hl2e)));
+            need_flush = 1;
+        }
     }
 
     *shadow_hl2e_p = new_hl2e;
 
-    // paranoia rules!
-    return 1;
-    
+    return need_flush;
 }
 
 // returns true if a tlb flush is needed
@@ -844,15 +956,13 @@ validate_pde_change(
     l2_pgentry_t *shadow_pde_p)
 {
     l2_pgentry_t old_spde, new_spde;
+    int need_flush = 0;
 
     perfc_incrc(validate_pde_calls);
 
     old_spde = *shadow_pde_p;
     l2pde_propagate_from_guest(d, &new_gpde, &new_spde);
 
-    // XXX Shouldn't we propagate the new_gpde to the guest?
-    // And then mark the guest's L2 page as dirty?
-
     // Only do the ref counting if something important changed.
     //
     if ( ((l2e_get_value(old_spde) | l2e_get_value(new_spde)) & _PAGE_PRESENT) &&
@@ -864,13 +974,15 @@ validate_pde_change(
              !get_shadow_ref(l2e_get_pfn(new_spde)) )
             BUG();
         if ( l2e_get_flags(old_spde) & _PAGE_PRESENT )
+        {
             put_shadow_ref(l2e_get_pfn(old_spde));
+            need_flush = 1;
+        }
     }
 
     *shadow_pde_p = new_spde;
 
-    // paranoia rules!
-    return 1;
+    return need_flush;
 }
 
 /*********************************************************************/
@@ -1035,10 +1147,19 @@ static inline unsigned long __shadow_status(
     {
         perfc_incrc(shadow_status_shortcut);
 #ifndef NDEBUG
-        ASSERT(___shadow_status(d, gpfn, stype) == 0);
+        if ( ___shadow_status(d, gpfn, stype) != 0 )
+        {
+            printk("d->id=%d gpfn=%lx gmfn=%lx stype=%lx c=%x t=%x "
+                   "mfn_out_of_sync(gmfn)=%d mfn_is_page_table(gmfn)=%d\n",
+                   d->id, gpfn, gmfn, stype,
+                   frame_table[gmfn].count_info,
+                   frame_table[gmfn].u.inuse.type_info,
+                   mfn_out_of_sync(gmfn), mfn_is_page_table(gmfn));
+            BUG();
+        }
 
-        // Undo the affects of the above ASSERT on ___shadow_status()'s perf
-        // counters.
+        // Undo the effects of the above call to ___shadow_status()'s perf
+        // counters, since that call is really just part of an assertion.
         //
         perfc_decrc(shadow_status_calls);
         perfc_decrc(shadow_status_miss);
@@ -1056,12 +1177,12 @@ static inline unsigned long __shadow_status(
  *
  * Either returns PGT_none, or PGT_l{1,2,3,4}_page_table.
  */
-static inline unsigned long
+static inline u32
 shadow_max_pgtable_type(struct domain *d, unsigned long gpfn,
                         unsigned long *smfn)
 {
     struct shadow_status *x;
-    unsigned long pttype = PGT_none, type;
+    u32 pttype = PGT_none, type;
 
     ASSERT(spin_is_locked(&d->arch.shadow_lock));
     ASSERT(gpfn == (gpfn & PGT_mfn_mask));
@@ -1379,7 +1500,6 @@ shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
     struct exec_domain *ed = current;
     struct domain *d = ed->domain;
     l2_pgentry_t sl2e;
-    l1_pgentry_t old_spte;
 
 #if 0
     printk("shadow_set_l1e(va=%p, new_spte=%p, create=%d)\n",
@@ -1424,17 +1544,20 @@ shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
         }
     }
 
-    old_spte = shadow_linear_pg_table[l1_linear_offset(va)];
-
-    // only do the ref counting if something important changed.
-    //
-    if ( l1e_has_changed(&old_spte, &new_spte, _PAGE_RW | _PAGE_PRESENT) )
+    if ( shadow_mode_refcounts(d) )
     {
-        if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
-             !shadow_get_page_from_l1e(new_spte, d) )
-            new_spte = l1e_empty();
-        if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
-            put_page_from_l1e(old_spte, d);
+        l1_pgentry_t old_spte = shadow_linear_pg_table[l1_linear_offset(va)];
+
+        // only do the ref counting if something important changed.
+        //
+        if ( l1e_has_changed(&old_spte, &new_spte, _PAGE_RW | _PAGE_PRESENT) )
+        {
+            if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
+                 !shadow_get_page_from_l1e(new_spte, d) )
+                new_spte = l1e_empty();
+            if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
+                shadow_put_page_from_l1e(old_spte, d);
+        }
     }
 
     shadow_linear_pg_table[l1_linear_offset(va)] = new_spte;
@@ -1444,6 +1567,27 @@ shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
 
 /************************************************************************/
 
+static inline int
+shadow_mode_page_writable(struct domain *d, unsigned long gpfn)
+{
+    unsigned long mfn = __gpfn_to_mfn(d, gpfn);
+    u32 type = frame_table[mfn].u.inuse.type_info & PGT_type_mask;
+
+    if ( shadow_mode_refcounts(d) &&
+         (type == PGT_writable_page) )
+        type = shadow_max_pgtable_type(d, gpfn, NULL);
+
+    if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
+         (type == PGT_l1_page_table) )
+        return 1;
+
+    if ( shadow_mode_write_all(d) &&
+         type && (type <= PGT_l4_page_table) )
+        return 1;
+
+    return 0;
+}
+
 static inline l1_pgentry_t gva_to_gpte(unsigned long gva)
 {
     l2_pgentry_t gpde;
index f72bce7936b380b96233c00503ed5a90ae9cf51a..0220de530e8e4aa407362e1aa35685048141f8c9 100644 (file)
@@ -26,4 +26,51 @@ extern void *map_domain_mem(unsigned long pa);
  */
 extern void unmap_domain_mem(void *va);
 
+struct map_dom_mem_cache {
+    unsigned long pa;
+    void *va;
+};
+
+#define MAP_DOM_MEM_CACHE_INIT { .pa = 0 }
+
+static inline void *
+map_domain_mem_with_cache(unsigned long pa,
+                          struct map_dom_mem_cache *cache)
+{
+    if ( likely(cache != NULL) )
+    {
+        if ( likely(cache->pa) )
+        {
+            if ( likely((pa & PAGE_MASK) == (cache->pa & PAGE_MASK)) )
+                goto done;
+            unmap_domain_mem(cache->va);
+        }
+        cache->pa = (pa & PAGE_MASK) | 1;
+        cache->va = map_domain_mem(cache->pa);
+    done:
+        return (void *)(((unsigned long)cache->va & PAGE_MASK) |
+                        (pa & ~PAGE_MASK));
+    }
+
+    return map_domain_mem(pa);
+}
+
+static inline void
+unmap_domain_mem_with_cache(void *va,
+                            struct map_dom_mem_cache *cache)
+{
+    if ( unlikely(!cache) )
+        unmap_domain_mem(va);
+}
+
+static inline void
+unmap_domain_mem_cache(struct map_dom_mem_cache *cache)
+{
+    if ( likely(cache != NULL) && likely(cache->pa) )
+    {
+        unmap_domain_mem(cache->va);
+        cache->pa = 0;
+    }
+}
+
 #endif /* __ASM_DOMAIN_PAGE_H__ */
index 313635f82a52abd7410fa90f8f11e8373b3d032a..91c34ce2325a3c7774e986ca6392f44cb16cb3eb 100644 (file)
@@ -15,7 +15,7 @@
 #define BUG_ON(_p) do { if (_p) BUG(); } while ( 0 )
 
 #ifndef NDEBUG
-#define ASSERT(_p) if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s\n", #_p , __LINE__, __FILE__); BUG(); }
+#define ASSERT(_p) { if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s\n", #_p , __LINE__, __FILE__); BUG(); } }
 #else
 #define ASSERT(_p) ((void)0)
 #endif
index 501851ec55d147dbecb4a05b189029ef97f4570c..c7f2493a1a14f96cafda20cff19ae11ead6b1ab9 100644 (file)
@@ -86,12 +86,14 @@ PERFCOUNTER_CPU(resync_hl2,                        "resync HL2 page")
 PERFCOUNTER_CPU(shadow_make_snapshot,              "snapshots created")
 PERFCOUNTER_CPU(shadow_mark_mfn_out_of_sync_calls, "calls to shadow_mk_out_of_sync")
 PERFCOUNTER_CPU(shadow_out_of_sync_calls,          "calls to shadow_out_of_sync")
-PERFCOUNTER_CPU(extra_va_update_sync,              "extra syncs for bug in chk_pgtb")
 PERFCOUNTER_CPU(snapshot_entry_matches_calls,      "calls to ss_entry_matches")
 PERFCOUNTER_CPU(snapshot_entry_matches_true,       "ss_entry_matches returns true")
 
 PERFCOUNTER_CPU(validate_pte_calls,                "calls to validate_pte_change")
-PERFCOUNTER_CPU(validate_pte_changes,              "validate_pte makes changes")
+PERFCOUNTER_CPU(validate_pte_changes1,             "validate_pte makes changes1")
+PERFCOUNTER_CPU(validate_pte_changes2,             "validate_pte makes changes2")
+PERFCOUNTER_CPU(validate_pte_changes3,             "validate_pte makes changes3")
+PERFCOUNTER_CPU(validate_pte_changes4,             "validate_pte makes changes4")
 PERFCOUNTER_CPU(validate_pde_calls,                "calls to validate_pde_change")
 PERFCOUNTER_CPU(validate_pde_changes,              "validate_pde makes changes")
 PERFCOUNTER_CPU(shadow_get_page_fail,   "shadow_get_page_from_l1e fails" )